In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix
from IPython.core.display_functions import display
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans

# Render every float that pandas displays with three decimal places
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Load the Excel workbook with the panel dataset into df
df = pd.read_excel(io="./data/COVID-19-Constructed-Dataset-(PANEL).xlsx")
df

# 3. Preprocessing
### 3-1. Convert categorical data to numeric data

In [None]:
# Convert the categorical 'gender' column to integer codes.
# OrdinalEncoder numbers the categories in sorted order, so the intended
# mapping "0 = Female, 1 = Male" holds only if those are the labels present
# in the column -- inspect encoder.categories_ to confirm.
encoder = OrdinalEncoder(dtype=np.int64)
X = df[['gender']]
encoder.fit(X)

# Deep copy so the raw frame `df` stays untouched
data_select = df.copy()
# Assign the codes positionally as a 1-D array. Going through an intermediate
# DataFrame would align on index labels and silently yield NaN whenever df
# does not carry the default 0..n-1 index.
data_select['gender'] = encoder.transform(X).ravel()

# Verify the data has been changed appropriately
data_select

In [None]:
# Collapse 'timeperiod' into a binary flag and rename the column to 'online'
# 0 = notOnline (timeperiod < 3), 1 = online (every other value)
#
# Vectorised replacement of the former row-by-row loop. Negating the "< 3"
# test keeps the loop's else-branch semantics: anything that is not below 3
# becomes 1.
is_not_online = data_select['timeperiod'] < 3
data_select['timeperiod'] = (~is_not_online).astype(int)

# Rename (rather than add/drop) so the column keeps its position in the frame
data_select = data_select.rename(columns={'timeperiod': 'online'})

# Display data for appropriately checking
data_select

### 3-2. Deriving Features from Existing Features
#### 3-2-1. Create a new feature from existing features

In [None]:
# Average of the six subject scores (the three scores and their 'SL' variants)
score_columns = ['readingscore', 'writingscore', 'mathscore',
                 'readingscoreSL', 'writingscoreSL', 'mathscoreSL']
score_sum = data_select[score_columns[0]]
for score_name in score_columns[1:]:
    score_sum = score_sum + data_select[score_name]
data_select['total'] = score_sum / 6

# The individual scores are summarised by 'total', so they are dropped;
# 'covidpos' is removed in the same step
dropCol = score_columns + ['covidpos']
data_select = data_select.drop(dropCol, axis=1)

data_select

In [None]:
# Build one row per student: the student-level columns plus the average
# 'total' grade in each mode (notOnline, Online) and their difference.
# The first 11 columns of data_select are taken as the student-level data.
data_transform = (
    data_select.iloc[:, :11]
    .drop_duplicates(['studentID'])  # keep a single row per student ID
    .reset_index(drop=True)
)

# Sum of 'total' per (studentID, online) pair, divided by 3 exactly as the
# former per-student loop did (NOTE(review): presumably three periods per
# mode -- confirm against the data).
# Grouping on the actual IDs removes the old assumption that the IDs are
# exactly 1..N. A missing (student, mode) pair counts as 0, like an empty sum.
mode_average = (
    data_select.groupby(['studentID', 'online'])['total'].sum()
    .unstack('online')
    .reindex(columns=[0, 1])
    .fillna(0)
    / 3
)
data_transform['totalNotOnline'] = data_transform['studentID'].map(mode_average[0])
data_transform['totalOnline'] = data_transform['studentID'].map(mode_average[1])
data_transform['differ'] = data_transform['totalNotOnline'] - data_transform['totalOnline']

# Drop the 'online' column because it is no longer used. The drop must be
# taken from data_transform itself: taking it from data_select replaced the
# frame and threw away the three columns computed above.
data_transform = data_transform.drop(columns="online")

# Display data for appropriately checking
data_transform

In [None]:
# Summary statistics of data_transform before the outlier handling below
data_transform.describe()

#### 3-2-2. Outlier Handling

In [None]:
def outliars(data, column, factor=1.5):
    """Return the index labels of the rows whose `column` value is an IQR outlier.

    A value counts as an outlier when it lies below ``Q1 - factor * IQR`` or
    above ``Q3 + factor * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of the column and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect; it is not modified.
    column : label
        Name of the numeric column to check.
    factor : float, default 1.5
        Multiple of the IQR used as the fence distance.

    Returns
    -------
    pandas.Index
        Index labels of the outlier rows (empty when there are none).
    """
    values = data[column]

    # Nothing to measure against: no rows, or only missing values
    if values.notna().sum() == 0:
        return data.index[:0]

    # nanpercentile ignores missing values; with plain percentile a single NaN
    # turns both quartiles into NaN and no row would ever be flagged
    Q1, Q3 = np.nanpercentile(values, [25, 75])

    IQR = Q3 - Q1
    outlier_step = factor * IQR

    is_outlier = (values < Q1 - outlier_step) | (values > Q3 + outlier_step)
    outliers_index = data[is_outlier].index
    return outliers_index

In [None]:
# Outlier handling for the 'differ' column
outliers_index = outliars(data_transform, "differ")

# Remove the flagged rows first, then the rows whose 'differ' is exactly 0,
# renumbering the index after each step
data_transform = data_transform.drop(index=outliers_index).reset_index(drop=True)
zero_differ_rows = data_transform.index[data_transform['differ'] == 0]
data_transform = data_transform.drop(index=zero_differ_rows).reset_index(drop=True)

In [None]:
# Summary statistics of data_transform after the outlier rows were removed
data_transform.describe()

#### 3-2-3. Create a new feature from the 'total' columns using clustering

In [None]:
# K-means clustering on scaled features + scatter plot of the clusters
def clustering_scaling(data, i, k, scaler, relationCol, clusterName, xCol, yCol):
    """Cluster `data` with K-means on scaled columns and plot the clusters.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to cluster. It is modified in place: the cluster label of every
        row is stored in the column `clusterName`.
    i : int
        Unused; kept so existing calls keep working.
    k : int
        Number of clusters.
    scaler : object with ``fit_transform``
        Scaler applied to the `relationCol` columns before clustering.
    relationCol : list of column labels
        Columns the clustering is based on.
    clusterName : str
        Name of the column receiving the cluster labels (also the legend prefix).
    xCol, yCol : column labels
        Columns drawn on the x and y axis of the scatter plot.

    Returns
    -------
    tuple
        ``(plt, sizes)``: the pyplot module holding the current figure, and a
        Series with the number of rows per cluster.
    """
    data_scale = scaler.fit_transform(data.loc[:, list(relationCol)])  # data scaling

    # fit_predict trains the model and returns the labels in one call; a
    # separate fit() beforehand would only repeat the same training
    model = KMeans(n_clusters=k, random_state=42)
    data[clusterName] = model.fit_predict(data_scale)  # save each clustering data

    plt.figure(figsize=(8, 8))

    # One scatter series per cluster (own loop variable, so `i` is not shadowed)
    for cluster_id in range(k):
        members = data[clusterName] == cluster_id
        plt.scatter(data.loc[members, xCol], data.loc[members, yCol],
                    label=clusterName + str(cluster_id))

    # make plot graph
    plt.legend()
    plt.title(str(scaler))
    # Label the axes with the columns that are actually plotted
    plt.xlabel(xCol, size=12)
    plt.ylabel(yCol, size=12)

    return plt, data.groupby(clusterName).size()

In [None]:
# One independent copy of the per-student frame for each scaler under comparison
data_transform_std = data_transform.copy()
data_transform_robust = data_transform.copy()
data_transform_minmax = data_transform.copy()
data_transform_list = [data_transform_std, data_transform_robust, data_transform_minmax]

scaler_list = [StandardScaler(), RobustScaler(), MinMaxScaler()]

# Cluster every copy into 3 groups ('level') with its own scaler and show the result
cluster_features = ["totalNotOnline", "totalOnline", "differ"]
for position, (frame, frame_scaler) in enumerate(zip(data_transform_list, scaler_list)):
    plt, groupData = clustering_scaling(frame, position, 3, frame_scaler,
                                        cluster_features, "level",
                                        'totalNotOnline', 'differ')

    plt.show()
    print(groupData)

In [None]:
# Scaler + SelectKBest
def selectKBest_scaling(data, scaler, xCol, yCol):
    """Score the `xCol` features against `yCol` after scaling them.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; it is not modified.
    scaler : object with ``fit_transform``
        Scaler applied to the feature columns before scoring.
    xCol : list of column labels
        Candidate feature columns.
    yCol : column label
        Target column.

    Returns
    -------
    pandas.DataFrame
        Columns 'Features' and 'Score', holding at most the 10 highest-scoring
        features in descending score order.
    """
    x = data.loc[:, xCol]  # select using columns
    y = data.loc[:, yCol]  # level

    # Scaled copy that keeps the original column labels and row index
    x_scaled = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)

    # Only scores_ is read below, so k just has to be valid for the number of
    # features (a fixed 5 is rejected when fewer columns are passed)
    bestfeatures = SelectKBest(score_func=f_regression, k=min(5, x_scaled.shape[1]))
    # Fit on the scaled data -- the scaled frame used to be built and then ignored
    fit = bestfeatures.fit(x_scaled, y)

    featureScores = pd.DataFrame({'Features': x_scaled.columns, 'Score': fit.scores_})

    return featureScores.nlargest(10, 'Score')

In [None]:
# Candidate features scored against the cluster label 'level'.
# 'covidpos' is not listed: it was dropped from the data together with the
# raw subject scores, so selecting it would raise a KeyError.
x_columns = ["school", "gender", "householdincome", "freelunch", "numcomputers", "familysize",
             "fathereduc", "mothereduc", "totalNotOnline", "totalOnline"]
for frame_scaler, frame in zip(scaler_list, data_transform_list):
    print(frame_scaler)
    print(selectKBest_scaling(frame, frame_scaler, x_columns, "level"))
    print()

In [None]:
# Annotated correlation heatmap over the columns of data_transform
correlations = data_transform.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlations, annot=True, fmt='.2f',
            linewidths=.5, cmap='Blues')

# 4. Modeling
### 4-1. Linear regression with each scaler & evaluation

In [None]:
def linearRegScaling(scaler, k, data, testSize, largeColumns, target):
    """Scale features, store K-means labels, and score a linear regression.

    Parameters
    ----------
    scaler : object with ``fit_transform``
        Scaler applied to the `largeColumns` columns.
    k : int
        Number of K-means clusters.
    data : pandas.DataFrame
        Source frame. It is modified in place: the K-means label of every row
        is written to the column 'predict'.
    testSize : float
        Fraction of the rows held out for testing.
    largeColumns : list or tuple of column labels
        Feature columns.
    target : column label
        Column the regression is fitted to.

    Returns
    -------
    tuple
        ``(train_score, test_score)`` as returned by ``LinearRegression.score``
        on the training and on the test split.
    """
    # list(...) so a tuple of names is always read as several column labels
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split, so test rows contribute to the scaling statistics.
    data_scale = scaler.fit_transform(data.loc[:, list(largeColumns)])  # data scaling

    # fit_predict trains the model and returns the labels in one call; a
    # separate fit() beforehand would only repeat the same training
    model = KMeans(n_clusters=k, random_state=42)
    data['predict'] = model.fit_predict(data_scale)  # save each clustering data

    x = data_scale  # scaled feature columns
    y = data.loc[:, target]  # target column

    reg = linear_model.LinearRegression()

    # Split the dataset into training and testing
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=testSize, random_state=42, shuffle=True)
    reg.fit(x_train, y_train)

    # Score the fitted model on both splits
    resultTrainScore = reg.score(x_train, y_train)
    resultTestScore = reg.score(x_test, y_test)

    return resultTrainScore, resultTestScore

In [None]:
# Test-set fractions and scalers to compare, one frame per scaler
split = [0.1, 0.2, 0.3]
scaler = [StandardScaler(), RobustScaler(), MinMaxScaler()]
data_transform_list = [data_transform_std, data_transform_robust, data_transform_minmax]
largeColumns1 = ("school", "totalNotOnline")

# Pre-allocated result table: one row per (scaler, split) combination
resultBestScaler = pd.DataFrame(index=range(0, 9),
                                columns=["Scaler", "Train", "Test", "TrainSet Score", "TestSet Score"])

for scaler_pos, current_scaler in enumerate(scaler):
    for split_pos, test_fraction in enumerate(split):
        train_score, test_score = linearRegScaling(current_scaler, 3, data_transform_list[scaler_pos],
                                                   test_fraction, largeColumns1, "level")
        # Rows are filled scaler by scaler, split by split
        row_number = scaler_pos * len(split) + split_pos
        resultBestScaler.iloc[row_number] = [str(current_scaler), str(1 - test_fraction), str(test_fraction),
                                             train_score, test_score]

display(resultBestScaler)

In [None]:
# Same scaler / split grid as before, now with 'householdincome' as a third feature
split = [0.1, 0.2, 0.3]
scaler = [StandardScaler(), RobustScaler(), MinMaxScaler()]

# Pre-allocated result table: one row per (scaler, split) combination
resultBestScaler = pd.DataFrame(index=range(0, 9),
                                columns=["Scaler", "Train", "Test", "TrainSet Score", "TestSet Score"])

largeColumns2 = ("school", "totalNotOnline", "householdincome")
next_row = 0
for current_scaler, frame in zip(scaler, data_transform_list):
    for test_fraction in split:
        scores = linearRegScaling(current_scaler, 3, frame, test_fraction, largeColumns2, "level")
        resultBestScaler.iloc[next_row] = [str(current_scaler), str(1 - test_fraction), str(test_fraction),
                                           scores[0], scores[1]]
        next_row += 1

display(resultBestScaler)

#### -> The best score is obtained with RobustScaler
<br><hr><br>

### 4-2. Model Evaluation Metrics for Regression

In [None]:
# Fit a linear regression on one train/test split and report its error metrics
def modelEvaluation(type, largeColumns, target, testSize):
    """Return MAE, MSE and RMSE of a linear regression on the test split.

    Reads the notebook-level lists ``data_transform_list`` and ``scaler_list``;
    `type` selects the frame and the scaler at the same position in both.

    Parameters
    ----------
    type : int
        Position in ``data_transform_list`` / ``scaler_list``.
    largeColumns : list or tuple of column labels
        Feature columns.
    target : column label
        Column the regression is fitted to.
    testSize : float
        Fraction of the rows held out for testing.

    Returns
    -------
    tuple
        ``(mae, mse, rmse)`` computed on the test split.
    """
    frame = data_transform_list[type]

    # list(...) so a tuple of names is always read as several column labels
    x = scaler_list[type].fit_transform(frame.loc[:, list(largeColumns)])  # scaled features
    y = frame.loc[:, target]  # target column

    reg = linear_model.LinearRegression()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=testSize, random_state=42, shuffle=True)
    reg.fit(x_train, y_train)
    y_pred = reg.predict(x_test)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    # RMSE is the square root of the MSE (the root of the MAE was used before)
    rmse = np.sqrt(mse)

    return mae, mse, rmse

In [None]:
# Result table with one row per scaler
metric_names = ["Scaler", "MAE", "MSE", "RMSE"]
evaluationMetrix = pd.DataFrame(index=range(0, 3), columns=metric_names)

# Row order: StandardScaler, RobustScaler, MinMaxScaler
for row in range(3):
    mae, mse, rmse = modelEvaluation(row, largeColumns1, "level", 0.2)
    # Store this scaler's metrics as one row of the DataFrame
    evaluationMetrix.iloc[row, :] = [scaler_list[row], mae, mse, rmse]

display(evaluationMetrix)

In [None]:
# Result table with one row per scaler, this time for the largeColumns2 features
evaluationMetrix = pd.DataFrame(columns=["Scaler", "MAE", "MSE", "RMSE"],
                                index=range(0, 3))

row = 0
while row < 3:  # StandardScaler, RobustScaler, MinMaxScaler
    mae, mse, rmse = modelEvaluation(row, largeColumns2, "level", 0.2)
    evaluationMetrix.iloc[row, :] = [scaler_list[row], mae, mse, rmse]  # add the row to the DataFrame
    row += 1

display(evaluationMetrix)

# 5. Learning model evaluation and analysis
### 5-1. Confusion Matrix

In [None]:
# Confusion matrix of 'level' against 'predict' on the RobustScaler frame.
# Called through the `metrics` module: a later cell rebinds the bare name
# `confusion_matrix` to a DataFrame, after which re-running this cell with the
# bare name would fail.
metrics.confusion_matrix(data_transform_robust.loc[:, "level"], data_transform_robust.loc[:, "predict"])

In [None]:
# Pair the two label columns of the RobustScaler frame and cross-tabulate them
actual_labels = data_transform_robust.loc[:, "level"]
predicted_labels = data_transform_robust.loc[:, "predict"]
data_confusion = {"Actual": actual_labels, "Predict": predicted_labels}
dataframe_confusion = pd.DataFrame(data_confusion, columns=["Actual", "Predict"])

# NOTE: from here on the name `confusion_matrix` refers to this crosstab,
# no longer to the function imported from sklearn.metrics
confusion_matrix = pd.crosstab(index=dataframe_confusion["Actual"],
                               columns=dataframe_confusion["Predict"])

sns.heatmap(data=confusion_matrix, annot=True, fmt='.0f', linewidths=.5, cmap='Blues')

In [None]:
def precisionRecall(matrix, column):
    """Per-class precision and recall from a square confusion table.

    `matrix` is read positionally: the diagonal cell of class c divided by the
    sum of column c gives the precision, divided by the sum of row c gives the
    recall. `column` is only used for its length (the number of classes).

    Returns a tuple ``(precision, recall)`` of float arrays.
    """
    class_positions = range(len(column))

    precision = np.array(
        [matrix.iloc[c, c] / matrix.iloc[:, c].sum() for c in class_positions],
        dtype=float,
    )
    recall = np.array(
        [matrix.iloc[c, c] / matrix.iloc[c, :].sum() for c in class_positions],
        dtype=float,
    )

    return precision, recall

In [None]:
# Label arrays of the RobustScaler frame
y_true = np.array(data_transform_robust["level"])
y_pred = np.array(data_transform_robust["predict"])

# Re-label the crosstab with the class ids 0..2 on both axes, then score it
class_ids = [0, 1, 2]
confusion_matrix_result = pd.DataFrame(confusion_matrix, columns=class_ids, index=class_ids)
precision, recall = precisionRecall(confusion_matrix_result, confusion_matrix.columns)

# Report precision for every class first, then recall
report_lines = [
    ("Precision (0): %.2f", precision[0]),
    ("Precision (1): %.2f", precision[1]),
    ("Precision (2): %.2f", precision[2]),
    ("Recall (0): %.2f", recall[0]),
    ("Recall (1): %.2f", recall[1]),
    ("Recall (2): %.2f", recall[2]),
]
for line_format, value in report_lines:
    print(line_format % value)