In [263]:
import pandas as pd
# import ydata_profiling  as ProfileReport
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LarsCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

from lazypredict.Supervised import LazyRegressor


# Load the student-score dataset.
# The path is a raw string: in a normal literal the backslash sequences in this
# Windows path are invalid escapes, which Python reports with a warning on
# recent versions. The r-prefix keeps the resulting path value identical.
# The file has an .xls extension but is parsed here as delimited text.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = pd.read_csv(r'D:\Hutson\learning-materials\AI&ML\Khóa 12-02AIMLDLCV nâng cao\Class\Datasets\StudentScore.xls')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [264]:
# Distinct education levels, in order of first appearance in the data.
pd.unique(df['parental level of education'])

array(["bachelor's degree", 'some college', "master's degree",
       "associate's degree", 'high school', 'some high school'],
      dtype=object)

In [265]:
# Keep only the integer/float columns, then show their pairwise correlation.
numeric_data = df.select_dtypes(include=[int, float])
numeric_data.corr()

Unnamed: 0,math score,reading score,writing score
math score,1.0,0.82,0.8
reading score,0.82,1.0,0.95
writing score,0.8,0.95,1.0


In [266]:
# Column the model is trained to predict.
target = 'math score'

# Labels are the target column; features are every remaining column.
y = df[target]
x = df.drop(columns=[target])

In [267]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [268]:
# Numeric branch: fill missing values with the column median, then standardize.
num_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_transformer = Pipeline(steps=num_steps)

In [269]:
# Category orders handed to OrdinalEncoder: one list per encoded column, in the
# order the categories should be mapped to 0, 1, 2, ...
education_values = ["some high school", "high school", "some college", "associate's degree", "bachelor's degree", "master's degree"]
gender_values = ['male', 'female']

# Derived from the training split. Sorting makes the integer codes independent
# of the row order produced by the split, and dropna() keeps NaN out of the
# category list (missing values are filled by the imputer step before encoding).
lunch_values = sorted(x_train['lunch'].dropna().unique())
test_preparation_course_values = sorted(x_train['test preparation course'].dropna().unique())


# Ordinal branch: fill missing values with the most frequent category, then map
# each category to its position in the lists above.
ord_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # The step keeps its original name 'scaler' so existing parameter paths
    # still resolve, although it encodes rather than scales.
    ('scaler', OrdinalEncoder(categories=[
        education_values,
        gender_values,
        lunch_values,
        test_preparation_course_values,
    ])),
])

In [270]:
# Nominal branch: fill missing values with the most frequent category, then
# one-hot encode.
nom_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category that was not present when the encoder
    # was fitted (e.g. absent from one cross-validation fold) is encoded as all
    # zeros instead of raising at transform time. Known categories are encoded
    # exactly as before. The step keeps its original name 'scaler' so existing
    # parameter paths still resolve.
    ('scaler', OneHotEncoder(handle_unknown='ignore')),
])

In [271]:
# Column groups, one per preprocessing branch.
numeric_columns = ["reading score", "writing score"]
nominal_columns = ["race/ethnicity"]
ordinal_columns = ["parental level of education", "gender", "lunch", "test preparation course"]

# Send each column group through its own transformer pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num_features", num_transformer, numeric_columns),
        ("nom_features", nom_transformer, nominal_columns),
        ("ord_features", ord_transformer, ordinal_columns),
    ]
)

In [272]:
# Full estimator: preprocessing followed by a LarsCV regressor with default settings.
reg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', LarsCV()),
    ]
)

# Baseline: fit on the training split and predict the held-out rows.
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)

In [273]:
# Search space for the 'model' step of the pipeline (keys use the
# <step>__<parameter> naming so they reach the LarsCV estimator).
# NOTE(review): "model__n_jobs" and "model__precompute" look like compute
# settings rather than quality-affecting hyperparameters -- confirm whether
# they belong in the search space.
params = {
    "model__max_n_alphas": [100, 200, 300, 400, 500],
    "model__n_jobs": [1, 2, 3, 4, 5],
    "model__precompute": [True, False],
    "model__max_iter": [100, 200, 300],
    "model__cv": [3, 4, 5, 6, 7, 8]
}

# Randomized search: 9 sampled candidates, each scored by 5-fold CV on R2.
model = RandomizedSearchCV(
    reg,
    param_distributions=params,
    n_iter=9,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Fit the search on the training split, then predict the held-out rows with
# the fitted search object (this replaces the earlier baseline y_pred).
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


In [274]:
# Disabled experiment: benchmark many regressors with LazyRegressor.
# NOTE(review): if re-enabled, this would rebind the name `reg`, which an
# earlier cell uses for the preprocessing + LarsCV pipeline -- use a different
# variable name.
# reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)

# models, predictions = reg.fit(x_train, x_test, y_train, y_test)
# print(models)

In [275]:
# Best sampled parameter combination and its mean cross-validated score.
print(model.best_params_, model.best_score_, sep="\n")

{'model__precompute': True, 'model__n_jobs': 1, 'model__max_n_alphas': 300, 'model__max_iter': 300, 'model__cv': 4}
0.8697165544648893


In [276]:
# Report each error/fit metric for the held-out predictions, one per line.
for label, metric in (
    ("mean square: {}", mean_squared_error),
    ("mean absolute error: {}", mean_absolute_error),
    ("R2: {}", r2_score),
):
    print(label.format(metric(y_test, y_pred)))


### Above 0.8 is good, below 0.8 is bad (presumably refers to the R2 value)

mean square: 28.6598115057876
mean absolute error: 4.1696384084800755
R2: 0.8822224050713426
