In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import combinations
from scipy.stats import ttest_ind

# Render at most 50 rows and 50 columns when a frame is displayed.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

# Read the dataset from the CSV file in the working directory.
stud = pd.read_csv('stud_math.csv')

In [2]:
# Objective of the project: Data processing and analysis to determine the parameters of the future model

In [3]:
# Evaluating the initial data

In [None]:
# First look at the data: a ten-row preview, then the dtype / non-null summary.
preview = stud.head(10)
display(preview)
stud.info()

In [None]:
# Show the column labels of the loaded frame.
stud.columns

In [5]:
# Bringing Column Names to a Single Standard

In [None]:
# Bring the column names to a single snake_case standard.
# DataFrame.rename returns a new frame, so the result is assigned back to
# `stud`; the cells below rely on the new names.
stud = stud.rename(columns={'Pstatus': 'p_status', 'Medu': 'm_edu', 'Fedu': 'f_edu',
                            'Mjob': 'm_job', 'Fjob': 'f_job', 'schoolsup': 'school_sup',
                            'famsup': 'fam_sup', 'famrel': 'fam_rel',
                            'studytime, granular': 'studytime_granular'})

In [6]:
# The number of data gaps is no more than 12%. Replacing them will not have a significant impact on the result.

In [7]:
# In nominal (categorical) columns, replace gaps with the column mode

In [None]:
# Fill the gaps of every non-numeric column with that column's mode.
columns_obj_list = stud.select_dtypes(exclude=[np.number]).columns
for column in columns_obj_list:
    modes = stud[column].mode()
    if modes.empty:
        # The column has no non-missing values, so there is nothing to fill with.
        continue
    # fillna targets missing values directly instead of comparing str(x) to 'nan'.
    stud[column] = stud[column].fillna(modes.iloc[0])
stud.info()

In [8]:
# In numeric columns (except 'absences' and 'score'), replace gaps with the column mode

In [None]:
# Numeric columns whose gaps are filled with the column mode.
columns_num_list = ['m_edu', 'f_edu', 'traveltime', 'studytime', 'failures',
                    'studytime_granular', 'fam_rel', 'freetime', 'goout', 'health']
for column in columns_num_list:
    modes = stud[column].mode()
    if modes.empty:
        # The column has no non-missing values, so there is nothing to fill with.
        continue
    # fillna targets missing values directly instead of comparing str(x) to 'nan'.
    stud[column] = stud[column].fillna(modes.iloc[0])
stud.info()

In [9]:
# Replace erroneous values in two columns ('f_edu' and 'fam_rel') with the mode

In [None]:
# The value 40.0 in 'f_edu' is treated as erroneous and replaced with 2.0.
stud['f_edu'] = stud['f_edu'].replace(40.0, 2.0)

In [None]:
# The value -1.0 in 'fam_rel' is treated as erroneous and replaced with 4.0.
stud['fam_rel'] = stud['fam_rel'].replace(-1.0, 4.0)

In [10]:
# In 'studytime_granular', keep the values by modulus (absolute value, without the sign)

In [None]:
# Keep the magnitude of 'studytime_granular'. abs() is idempotent, so re-running
# this cell does not flip the sign back the way a plain negation would.
# NOTE(review): assumes the raw column holds only non-positive values, for which
# abs() gives the same result as the previous multiplication by -1 - confirm.
stud['studytime_granular'] = stud['studytime_granular'].abs()

In [11]:
# check 'absences'

In [None]:
# Frequency table of the 'absences' values, shown as a one-column frame.
stud['absences'].value_counts().to_frame()

In [12]:
# In 'absences', replace the outliers (385 and 212) with the median

In [None]:
# Replace the outlier 385.0 with the median of the column as it currently stands.
absences_median = stud['absences'].median()
stud['absences'] = stud['absences'].replace(385.0, absences_median)

In [None]:
# Replace the outlier 212.0 with the median of the column as it currently stands.
absences_median = stud['absences'].median()
stud['absences'] = stud['absences'].replace(212.0, absences_median)

In [13]:
# In 'absences', instead of gaps, set the mean

In [None]:
# Fill the gaps in 'absences' with the column mean (the mean skips missing values).
# fillna replaces the former per-element apply over a one-item column list.
columns_absences = ['absences']
absences_mean = stud['absences'].mean()
stud['absences'] = stud['absences'].fillna(absences_mean)
stud.info()

In [14]:
# check 'score'

In [None]:
# Frequency table of the 'score' values, shown as a one-column frame.
stud['score'].value_counts().to_frame()

In [None]:
# Distribution of the target: histogram plus summary statistics.
score_values = stud['score']
score_values.hist()
score_values.describe()

In [15]:
# check that someone with a score of 0 has other data and this is not an error

In [None]:
# Show the rows whose score is exactly 0 so the rest of their data can be inspected.
# The previous version referenced `.pivot_table` without calling it, so it
# displayed a bound method instead of the rows.
zero_score_rows = stud.loc[stud['score'] == 0]
display(zero_score_rows)

In [16]:
# In 'score', instead of gaps, set the mean

In [None]:
# Fill the gaps in 'score' with the column mean (the mean skips missing values).
# fillna replaces the former per-element apply over a one-item column list.
# NOTE(review): 'score' is the quantity being analysed; imputing it with the mean
# may bias the correlations and t-tests below - consider dropping these rows instead.
columns_score = ['score']
score_mean = stud['score'].mean()
stud['score'] = stud['score'].fillna(score_mean)
stud.info()

In [17]:
# The data is in order.

In [18]:
# We do correlation analysis

In [None]:
# Pairwise scatter plots with a fitted regression line in each panel.
sns.pairplot(data=stud, kind='reg')

In [None]:
# Correlation matrix of the numeric columns only. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# string data, so the numeric subset is selected explicitly.
stud.select_dtypes(include=[np.number]).corr()

In [19]:
# Conclusions: the variables with the greatest impact on the score are 'm_edu', 'age', 'f_edu', 'studytime' (positive) and 'goout' (negative)

In [20]:
# Analyze nominal (categorical) variables with Student's t-test

In [None]:
def get_stat_dif(column, data=None, target='score', alpha=0.05, max_levels=10):
    """Report whether any two levels of ``column`` differ significantly in ``target``.

    The ``max_levels`` most frequent values of ``column`` are compared pairwise
    with ``scipy.stats.ttest_ind``. The significance level is divided by the
    number of pairs (Bonferroni correction). As soon as one pair passes the
    corrected threshold, a message is printed and the search stops.

    Parameters
    ----------
    column : str
        Name of the column whose levels are compared.
    data : pandas.DataFrame, optional
        Frame to analyse. When omitted, the notebook-level ``stud`` frame is used.
    target : str, default 'score'
        Name of the numeric column compared between the levels.
    alpha : float, default 0.05
        Significance level before the Bonferroni correction.
    max_levels : int, default 10
        Number of most frequent levels of ``column`` that are compared.

    Returns
    -------
    bool
        True if at least one pair of levels differs significantly, else False.
    """
    if data is None:
        data = stud

    levels = data[column].value_counts().index[:max_levels]
    level_pairs = list(combinations(levels, 2))
    if not level_pairs:
        # Fewer than two levels: there is nothing to compare.
        return False

    # Bonferroni correction applied
    threshold = alpha / len(level_pairs)
    for first, second in level_pairs:
        pvalue = ttest_ind(data.loc[data[column] == first, target],
                           data.loc[data[column] == second, target]).pvalue
        if pvalue <= threshold:
            print('Найдены статистически значимые различия для колонки', column)
            return True
    return False

In [None]:
# Nominal columns to check for significant differences in the score.
nominal_columns = [
    'school', 'sex', 'address', 'famsize', 'p_status', 'm_job', 'f_job',
    'reason', 'guardian', 'school_sup', 'fam_sup', 'paid', 'activities',
    'nursery', 'higher', 'internet', 'romantic',
]
for col in nominal_columns:
    get_stat_dif(col)

In [21]:
# Found statistically significant differences for 'address', 'm_job', 'higher', 'romantic'

In [None]:
# Outcomes:
# 1. The data is in order and is now suitable for analysis (the data is of sufficient quality)
# 2. Correlation analysis and the analysis of the nominal data
# identified the main indicators that affect the 'score'.
# 3. These variables can be used to build a future model:
# 'age', 'm_edu', 'f_edu', 'studytime', 'goout', 'address', 'm_job', 'higher', 'romantic'

In [None]:
# Columns kept for the future model: the selected predictors plus the 'score' column.
model_columns = [
    'age', 'm_edu', 'f_edu', 'studytime', 'goout',
    'address', 'm_job', 'higher', 'romantic', 'score',
]
stud_for_model = stud[model_columns]
stud_for_model.head(10)