In [485]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, RobustScaler, Normalizer, QuantileTransformer, PowerTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from IPython.display import display, HTML
import warnings
# warnings.filterwarnings('ignore')

In [486]:
def extracting(df):
    # Khởi tạo Label Encoder
    le = LabelEncoder()
    df['author'] = le.fit_transform(df['author'])
    df['language'] = le.fit_transform(df['language'])
    df['publisher'] = le.fit_transform(df['publisher'])
    df['page_format'] = le.fit_transform(df['page_format'])

    df.drop(['link', 'title'], axis=1, inplace=True)

    df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x))
    df['genres'] = df['genres'].apply(lambda x: sorted(x))
    df['genres'] = df['genres'].apply(lambda x: ','.join(x))
    df['genres'] = le.fit_transform(df['genres'])

    df['ratings_count'] = df['ratings_count'].apply(lambda x: int(x.replace(',', '')))
    df['reviews_count'] = df['reviews_count'].apply(lambda x: int(x.replace(',', '')))
    df['num_pages'] = df['num_pages'].astype(float)
    df['publish_year'] = df['publish_year'].astype(float)
    
    return df

In [487]:
def fillMissingData(data_train, data_val, data_test):
    eod_columns = ['publish_year', 'num_pages']
    eod_values = [data_train[col].mean() + 3*data_train[col].std() for col in eod_columns]
    for col, val in zip(eod_columns, eod_values):
        data_train[col].fillna(val, inplace=True)
        data_val[col].fillna(val, inplace=True)
        data_test[col].fillna(val, inplace=True)
    return data_train, data_val, data_test

In [488]:
def plotDistribution(data, columns):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    axes = axes.flatten()
    for col, ax in zip(columns, axes):
        ax.hist(data[col], bins=50)
        ax.set_title(col)
    plt.show()

def plotCorrelation(data, columns):
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.matshow(data[columns].corr())
    plt.xticks(range(len(columns)), columns, rotation=90)
    plt.yticks(range(len(columns)), columns)
    plt.show()

def plotScatter(data, columns, target):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    for col, ax in zip(columns, axes):
        ax.scatter(data[col], data[target])
        ax.set_title(col)
    plt.show()

def plotBar(data, columns):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    for col, ax in zip(columns, axes):
        ax.bar(data[col].value_counts().index, data[col].value_counts())
        ax.set_title(col)
    plt.show()

def plotPie(data, columns):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    for col, ax in zip(columns, axes):
        ax.pie(data[col].value_counts(), labels=data[col].value_counts().index, autopct='%1.1f%%')
        ax.set_title(col)
    plt.show()

def plotHeatmap(data, columns):
    fig, ax = plt.subplots(figsize=(10, 10))
    ax.matshow(data[columns].corr())
    plt.xticks(range(len(columns)), columns, rotation=90)
    plt.yticks(range(len(columns)), columns)
    plt.show()

def plotHistogram(data, columns):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    for col, ax in zip(columns, axes):
        ax.hist(data[col], bins=30)
        ax.set_title(col)
    plt.show()

def plotCount(data, columns):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    for col, ax in zip(columns, axes):
        ax.bar(data[col].value_counts().index, data[col].value_counts())
        ax.set_title(col)
    plt.show()

def plotBox(data, columns):
    fig, axes = plt.subplots(1, len(columns), figsize=(20, 5))
    for col, ax in zip(columns, axes):
        ax.boxplot(data[col])
        ax.set_title(col)
    plt.show()

In [489]:
def printDescription(data, columns):
    for col in columns:
        print(data[col].describe())

In [490]:
skewedDists = ['ratings_count', 'reviews_count', 'num_pages', 'language']
isUpper_bridge = [True, True, True, True]
skewedDists = {col: isUpper_bridge for col, isUpper_bridge in zip(skewedDists, isUpper_bridge)}
gaussionDists = ['publish_year']
isUpper_boundary = [False]
gaussionDists = {col: isUpper_boundary for col, isUpper_boundary in zip(gaussionDists, isUpper_boundary)}

In [491]:
def outliers_handle(data, variable, isShow=False):
    upper_boundary=data[variable].mean() + 3 * data[variable].std()
    lower_boundary=data[variable].mean() - 3 * data[variable].std()
    if isShow:
        print(f"upper_boundary: {upper_boundary}")
        print(f"lower_boundary: {lower_boundary}")
        print(f"mean: {data[variable].mean()}")
    return lower_boundary, upper_boundary

def outliers_handle_skewed(data, variable, isShow=False):
    IQR=data[variable].quantile(0.75) - data[variable].quantile(0.25)
    lower_bridge=data[variable].quantile(0.25) - (IQR*3)
    upper_bridge=data[variable].quantile(0.75) + (IQR*3)
    if isShow:
        print(f"lower_bridge: {lower_bridge}")
        print(f"upper_bridge: {upper_bridge}")
    return lower_bridge, upper_bridge

In [492]:
minMaxScaler = MinMaxScaler()
maxAbsScaler = MaxAbsScaler()
standardScaler = StandardScaler()
robustScaler = RobustScaler()
normalizer = Normalizer()
quantileTransformer = QuantileTransformer(n_quantiles=640)
powerTransformer = PowerTransformer()
scalerArr = [minMaxScaler, maxAbsScaler, standardScaler, robustScaler, normalizer, quantileTransformer, powerTransformer]

In [493]:
def scaleData(data_train, data_val, data_test, scaler):
    scaler.fit(data_train)
    data_train = pd.DataFrame(scaler.transform(data_train.copy()), columns=data_train.columns)
    data_val = pd.DataFrame(scaler.transform(data_val.copy()), columns=data_val.columns)
    data_test = pd.DataFrame(scaler.transform(data_test.copy()), columns=data_test.columns)
    return data_train, data_val, data_test

In [494]:
def handleOutlier(data_train, data_val, data_test):
    for col in skewedDists:
        lower_bridge, upper_bridge = outliers_handle_skewed(data_train, col)
        if skewedDists[col]:
            data_train.loc[data_train[col]>=upper_bridge,col]=upper_bridge
            data_val.loc[data_val[col]>=upper_bridge,col]=upper_bridge
            data_test.loc[data_test[col]>=upper_bridge,col]=upper_bridge
        else:
            data_train.loc[data_train[col]<=lower_bridge,col]=lower_bridge
            data_val.loc[data_val[col]<=lower_bridge,col]=lower_bridge
            data_test.loc[data_test[col]<=lower_bridge,col]=lower_bridge
        
    for col in gaussionDists:
        lower_boundary, upper_boundary = outliers_handle(data_train, col)
        if gaussionDists[col]:
            data_train.loc[data_train[col]>=upper_boundary,col]=upper_boundary
            data_val.loc[data_val[col]>=upper_boundary,col]=upper_boundary
            data_test.loc[data_test[col]>=upper_boundary,col]=upper_boundary
        else:
            data_train.loc[data_train[col]<=lower_boundary,col]=lower_boundary
            data_val.loc[data_val[col]<=lower_boundary,col]=lower_boundary
            data_test.loc[data_test[col]<=lower_boundary,col]=lower_boundary
    return data_train, data_val, data_test

In [495]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

def featureSelection(X, y): 
    X_kbest = SelectKBest(f_classif, k = 5).fit(X, y)
    ix = X_kbest.get_support() 
    data = pd.DataFrame(X_kbest.transform(X), columns = X.columns.values[ix])
    # print(data.columns.values)
    return data.columns.values

In [496]:
def preprocessor(X_train, X_val, X_test):
    X_train, X_val, X_test = fillMissingData(X_train, X_val, X_test)
    X_train, X_val, X_test = handleOutlier(X_train, X_val, X_test)
    X_train, X_val, X_test = scaleData(X_train, X_val, X_test, powerTransformer)
    return X_train, X_val, X_test

In [497]:
def selectFeatures(X_train, y_train, X_val, X_test):
    featureArray = featureSelection(X_train, y_train)
    X_train = X_train[featureArray]
    X_val = X_val[featureArray]
    X_test = X_test[featureArray]
    return X_train, X_val, X_test

In [498]:
def proccessing(file_path):
    df = pd.read_csv(file_path)

    # check for doublications
    print(f"Number of duplicated rows: {df.duplicated().any().sum()}")
    if df.duplicated().any().sum() > 0:
        df.drop_duplicates(inplace=True)

    # check for missing values
    print(f"Number of missing values: \n{df.isnull().sum()}")

    df = extracting(df)

    data = df.copy()
    y = data['avg_ratings']
    X = data.drop(['avg_ratings'], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
    X_train, X_val, X_test = preprocessor(X_train, X_val, X_test)
    X_train, X_val, X_test = selectFeatures(X_train, y_train, X_val, X_test)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

In [499]:
X_train, X_val, X_test, y_train, y_val, y_test = proccessing('data_10000.csv')
X_train.head()

Number of duplicated rows: 1
Number of missing values: 
title              0
author             0
language         211
avg_ratings        0
ratings_count      0
reviews_count      0
publisher        372
publish_year      61
num_pages        124
page_format       44
genres             0
link               0
dtype: int64


Unnamed: 0,language,ratings_count,reviews_count,publish_year,num_pages
0,0.098921,0.40395,0.692743,8.326673e-17,-0.07148
1,0.098921,0.138715,0.027054,-2.775558e-17,-0.060487
2,0.098921,0.361205,0.242546,-8.326673e-17,-1.323114
3,0.098921,-0.574367,-0.957856,0.0,0.015445
4,0.098921,0.69834,1.499411,1.387779e-16,0.052767
