<a href="https://colab.research.google.com/github/huyhoang1996vn/my-ml/blob/master/ECER2_LS9_Regression_Remake.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz
from sklearn.model_selection import RandomizedSearchCV, train_test_split

In [25]:
# Read the Excel file. Each record is handled as a single comma-separated
# string held in the first column, so it is split into real columns below.
df = pd.read_excel('StudentScore.xlsx')

# Split the single column into one column per field
data = df.iloc[:, 0].str.split(',', expand=True)

# Assign meaningful column names (adjust these if the sheet layout changes)
column_names = ["gender", "race/ethnicity", "parental level of education", "lunch", "test preparation course", "math score", "reading score", "writing score"]
data.columns = column_names

# Score columns: strip the surrounding double quotes, then convert to numbers
int_columns = ['math score', 'reading score', 'writing score']
for col in int_columns:
    data[col] = data[col].str.strip('"')

    # Entries that cannot be parsed become NaN. The result is always numeric:
    # an integer dtype when every entry parsed as a whole number, float64 when
    # any entry is missing or fractional. No empty strings survive this step,
    # so no further string clean-up is needed for these columns.
    data[col] = pd.to_numeric(data[col], errors='coerce')

    if data[col].dtype == 'float64':
        try:
            # Whole numbers plus NaN fit the nullable integer dtype (NaN -> <NA>)
            data[col] = data[col].astype(pd.Int64Dtype())
        except (TypeError, ValueError):
            # Fractional values cannot be cast safely: leave the column as
            # float64. Only casting errors are caught here so that unrelated
            # failures (and KeyboardInterrupt) are no longer swallowed.
            pass

# Text columns: drop the double quotes left over from the CSV-style quoting
string_columns = ['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course']
for col in string_columns:
    data[col] = data[col].str.replace('"', '', regex=False)

# Display the first few rows of the cleaned DataFrame
display(data.head())

# Summary statistics of the numeric (score) columns
display(data.describe())

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.09,69.17,68.05
std,15.16,14.6,15.2
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [26]:
from sklearn.model_selection import train_test_split

# The writing score is the regression target; every other column is a feature.
target_column = 'writing score'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column mean, then standardise.
scaler_pipeline = Pipeline(steps=[
    ('imputer_mean', SimpleImputer(missing_values=np.nan, strategy='mean')),
    ('scaler', StandardScaler()),
])

# Write the transformed values into a copy so the raw training split is kept as-is.
X_train_clean = X_train.copy()

score_features = ['reading score', 'math score']
result = scaler_pipeline.fit_transform(X_train[score_features])

# Output columns come back in the same order as score_features.
for position, feature in enumerate(score_features):
    X_train_clean[feature] = result[:, position]

In [28]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category orderings: the position in each list is the encoded value.
education_order = [
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]
gender_order = ['male', 'female']
lunch_order = ['standard', 'free/reduced']
test_order = ['completed', 'none']

# Columns to encode and their orderings, listed in matching order.
ordinal_features = ['parental level of education', 'gender', 'lunch', 'test preparation course']
ordinal_categories = [education_order, gender_order, lunch_order, test_order]

# Fill gaps with the most frequent category, then map categories to their rank.
ordinal_pipeline = Pipeline(steps=[
    ('imputer_most_frequent', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=ordinal_categories)),
])

result = ordinal_pipeline.fit_transform(X_train[ordinal_features])

# Overwrite each categorical column in the working copy with its encoded values.
for position, feature in enumerate(ordinal_features):
    X_train_clean[feature] = result[:, position]


In [29]:
from sklearn.preprocessing import OneHotEncoder

# Nominal preprocessing: most-frequent imputation, then one dense indicator
# column per category.
nominal_pipeline = Pipeline(steps=[
    ('imputer_most_frequent', SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ('nominal_encoder', OneHotEncoder(sparse_output=False)),
])

# Fit and transform all nominal columns at once
nominal_features = ['race/ethnicity']
result = nominal_pipeline.fit_transform(X_train[nominal_features])

# Ask the fitted encoder for the generated indicator column names.
fitted_encoder = nominal_pipeline.named_steps['nominal_encoder']
new_column_names = fitted_encoder.get_feature_names_out(nominal_features)
encoded_nominal_df = pd.DataFrame(
    result,
    columns=new_column_names,
    index=X_train_clean.index,
)

# Append the indicator columns; the original 'race/ethnicity' column is kept.
# NOTE(review): re-running this cell appends the indicator columns again.
X_train_clean = pd.concat([X_train_clean, encoded_nominal_df], axis=1)

In [30]:
# Preview the raw training features. The scaled/encoded values produced in the
# cells above were written to the X_train_clean copy, not to X_train.
X_train.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score
29,female,group D,master's degree,standard,none,62,70
535,female,group C,bachelor's degree,free/reduced,completed,66,83
695,female,group D,some college,free/reduced,none,79,89
557,male,group C,master's degree,free/reduced,none,61,67
836,male,group E,high school,standard,none,73,64


In [31]:
from sklearn.compose import ColumnTransformer

# One (name, transformer, columns) entry per feature group; the groups reuse
# the scaler, ordinal and nominal pipelines defined in the earlier cells.
column_steps = [
    ('num_scaler', scaler_pipeline, ['reading score', 'math score']),
    ('ordinal_encoder', ordinal_pipeline, ['parental level of education', 'gender', 'lunch', 'test preparation course']),
    ('nominal_encoder', nominal_pipeline, ['race/ethnicity']),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit every group on the training features and collect the transformed matrix.
transformed_x_train = preprocessor.fit_transform(X_train)

In [32]:
from sklearn.linear_model import LinearRegression

# No separate intercept term is fitted. NOTE(review): the one-hot block keeps an
# indicator for every race/ethnicity category, so those columns can absorb the
# constant offset - confirm this is the intent.
linear_model = LinearRegression(fit_intercept=False)

# Chain preprocessing and the regressor so both are fitted on the same data.
reg = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', linear_model),
])
reg.fit(X_train, y_train)

In [33]:
# Predict the writing score for the held-out test features with the fitted pipeline.
y_predict=reg.predict(X_test)

In [34]:
# Show the predictions.
# NOTE(review): this echoes the whole array; a slice such as y_predict[:10]
# would keep the saved output short.
y_predict

array([88.87008167, 66.5011264 , 73.08408292, 71.1339806 , 79.76151845,
       73.68463711, 68.59952887, 62.57892369, 72.95235072, 51.25389302,
       41.53034408, 21.72976259, 79.14257286, 62.9688065 , 82.03071407,
       78.77448108, 49.46402173, 47.68427341, 57.14849121, 65.6603114 ,
       71.60772212, 55.66031902, 69.4026934 , 49.46983423, 77.6050834 ,
       73.7522358 , 73.75491287, 56.23014487, 48.34435586, 58.37157574,
       56.83541528, 64.80344249, 59.07968528, 66.02028795, 72.29924377,
       52.10314865, 74.48564271, 77.05586204, 79.64522417, 14.65846615,
       76.68599893, 63.22718659, 64.92488956, 61.5252036 , 84.19638548,
       64.52437861, 67.06405435, 32.02164466, 86.50649775, 83.01834076,
       72.18244932, 75.58291121, 77.7907192 , 58.47210869, 71.96554368,
       75.08562029, 78.70629149, 52.35709853, 80.55041691, 90.55006755,
       41.1185269 , 82.93626496, 80.03626601, 61.15714417, 88.56850015,
       77.48861371, 68.66314425, 50.34206935, 67.80788453, 93.32

In [35]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the baseline pipeline's predictions against the held-out targets.
r2 = r2_score(y_test, y_predict)
mse = mean_squared_error(y_test, y_predict)
rmse = mse ** 0.5  # square root puts the error back in score units

print(f"R-squared: {r2:.4f}")
print(f"Mean squared error: {mse:.4f}")
print(f"Root mean squared error: {rmse:.4f}")

R-squared: 0.9378
Mean squared error: 14.9808
Root mean squared error: 3.8705


In [36]:
!pip install lazypredict



In [37]:
from lazypredict.Supervised import LazyRegressor

# Benchmark many off-the-shelf regressors on the same split. The raw feature
# frames are passed in as they are.
clf = LazyRegressor(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)

  0%|          | 0/42 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 164
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 19
[LightGBM] [Info] Start training from score 68.456250


In [38]:
# Print the leaderboard returned by LazyRegressor.fit: one row per model with
# its Adjusted R-Squared, R-Squared, RMSE and Time Taken.
print(models)

                               Adjusted R-Squared  R-Squared  RMSE  Time Taken
Model                                                                         
HuberRegressor                               0.94       0.94  3.84        0.04
LassoCV                                      0.94       0.94  3.85        0.06
ElasticNetCV                                 0.94       0.94  3.86        0.07
BayesianRidge                                0.94       0.94  3.86        0.02
Ridge                                        0.94       0.94  3.86        0.02
RidgeCV                                      0.94       0.94  3.86        0.02
TransformedTargetRegressor                   0.94       0.94  3.86        0.02
LinearRegression                             0.94       0.94  3.86        0.02
KernelRidge                                  0.94       0.94  3.87        0.05
RANSACRegressor                              0.94       0.94  3.87        0.03
SGDRegressor                                 0.94   

In [39]:
from sklearn.model_selection import GridSearchCV

# Candidate settings for the 'regressor' step of the `reg` pipeline.
# Preprocessing options can be searched the same way by addressing the nested
# step names, e.g. 'preprocessor__num_scaler__imputer_mean__strategy' or
# 'preprocessor__num_scaler__scaler__with_std'.
# NOTE(review): n_jobs looks like a parallelism setting rather than a model
# hyper-parameter, so it probably triples the number of fits without changing
# any score - confirm and consider dropping it from the grid.
simple_param_grid = {
    'regressor__fit_intercept': [True, False],
    'regressor__n_jobs': [None, 1, 2],
}

# Exhaustive search over the grid, scored by R^2 with 5-fold cross-validation.
grid_search = GridSearchCV(
    estimator=reg,
    param_grid=simple_param_grid,
    scoring='r2',   # alternatives include 'neg_mean_absolute_error'
    cv=5,           # 5-fold cross-validation
    n_jobs=-1,      # use all available CPUs
    verbose=1,      # show progress
)

# Fit the grid search
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [40]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the held-out test split with the estimator selected by the grid search
grid_y_predict = grid_search.predict(X_test)
print(f"Best best_estimator: {grid_search.best_estimator_}")

# Print the best hyperparameters found
print(f"Best hyperparameters: {grid_search.best_params_}")

# Print the best mean cross-validated score (R^2, as set by scoring='r2')
print(f"Best cross-validated: {grid_search.best_score_:.4f}")

# Test-set metrics of the tuned model. All of them are computed from
# grid_y_predict; the MSE/RMSE previously used y_predict, i.e. the predictions
# of the untuned baseline pipeline, so they did not describe this model.
r2 = r2_score(y_test, grid_y_predict)
print(f"R-squared: {r2:.4f}")
mse = mean_squared_error(y_test, grid_y_predict)
print(f"Mean squared error: {mse:.4f}")

# Calculate and print RMSE.
rmse = mse ** 0.5
print(f"Root mean squared error: {rmse:.4f}")

Best best_estimator: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num_scaler',
                                                  Pipeline(steps=[('imputer_mean',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['reading score',
                                                   'math score']),
                                                 ('ordinal_encoder',
                                                  Pipeline(steps=[('imputer_most_frequent',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('ordinal_encoder',
                                                          