In [103]:
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import numpy as np
from sklearn import preprocessing, model_selection
from sklearn.ensemble import RandomForestRegressor, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn import metrics


In [48]:
df = pd.read_csv("data/wineQualityReds.csv")
display(df.head())
df.info()


Unnamed: 0.1,Unnamed: 0,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol,quality
0,1,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,2,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,3,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,4,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,5,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            1599 non-null   int64  
 1   fixed.acidity         1599 non-null   float64
 2   volatile.acidity      1599 non-null   float64
 3   citric.acid           1599 non-null   float64
 4   residual.sugar        1599 non-null   float64
 5   chlorides             1599 non-null   float64
 6   free.sulfur.dioxide   1599 non-null   float64
 7   total.sulfur.dioxide  1599 non-null   float64
 8   density               1599 non-null   float64
 9   pH                    1599 non-null   float64
 10  sulphates             1599 non-null   float64
 11  alcohol               1599 non-null   float64
 12  quality               1599 non-null   int64  
dtypes: float64(11), int64(2)
memory usage: 162.5 KB


In [49]:
df['quality'] = df['quality'].apply(lambda x: 1 if x>=6 else 0)
X = df.drop('quality', axis=1)
y = df['quality']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,y,test_size=0.3, random_state=42)


In [50]:
lr = LogisticRegression(random_state=42, max_iter=10000)
lr.fit(X_train, y_train)
print(f"Log Reg f1: {metrics.f1_score(y_test, lr.predict(X_test))}")
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': np.arange(3,14),
    'min_samples_leaf': np.arange(2,7),
    'min_samples_split': np.arange(3,12, 2)
}
gsv = model_selection.GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                             param_grid=param_grid,
                             cv=5,
                             scoring='f1')
gsv.fit(X_train, y_train)
# dt = DecisionTreeClassifier(random_state=42, max_depth=10)
# dt.fit(X_train, y_train)
print(f"DT f1: {metrics.f1_score(y_test, gsv.best_estimator_.predict(X_test))}")


Log Reg f1: 0.7575757575757577
DT f1: 0.7648183556405354


In [51]:
gsv.best_params_

{'criterion': 'gini',
 'max_depth': 13,
 'min_samples_leaf': 2,
 'min_samples_split': 9}

In [52]:
bg = BaggingClassifier(estimator=DecisionTreeClassifier(**gsv.best_params_),
                       n_estimators=1500
                       )
bg.fit(X_train, y_train)
print(f"Bagging dt f1: {metrics.f1_score(y_test, bg.predict(X_test))}")


Bagging dt f1: 0.8150943396226414


In [86]:
boston = pd.read_csv('data/boston (1).csv')
for col in boston.columns:
    if boston[col].dtype == 'object':
        boston[col] = boston[col].str.replace(',','.').astype('float32')
display(boston.head())
boston.info()


Unnamed: 0,crim_rate,zn,business,river,nit_oxiden,rooms,age,dist,highways_index,tax,pup_per_teaс,lower,target
0,0.00632,18.0,2.31,0,0.538,6.575,65.199997,4.09,1,296,15.3,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.900002,4.9671,2,242,17.799999,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.099998,4.9671,2,242,17.799999,4.03,34.700001
3,0.03237,0.0,2.18,0,0.458,6.998,45.799999,6.0622,3,222,18.700001,2.94,33.400002
4,0.06905,0.0,2.18,0,0.458,7.147,54.200001,6.0622,3,222,18.700001,5.33,36.200001


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   crim_rate       506 non-null    float32
 1   zn              506 non-null    float32
 2   business        506 non-null    float32
 3   river           506 non-null    int64  
 4   nit_oxiden      506 non-null    float32
 5   rooms           506 non-null    float32
 6   age             506 non-null    float32
 7   dist            506 non-null    float32
 8   highways_index  506 non-null    int64  
 9   tax             506 non-null    int64  
 10  pup_per_teaс    506 non-null    float32
 11  lower           506 non-null    float32
 12  target          506 non-null    float32
dtypes: float32(10), int64(3)
memory usage: 31.8 KB


In [114]:
X = boston[boston.columns[:-1]]
Y = boston["target"]
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,
                                                                    Y, train_size=0.3,
                                                                    random_state=13)
y_train.mean()

23.346355

In [115]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
metrics.mean_absolute_error(y_test, lr.predict(X_test))


3.8478554725869305

In [116]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
metrics.mean_absolute_error(y_test, dt.predict(X_test))

3.429859064666318

In [117]:
for num in [3,10,100,500]:
    est = RandomForestRegressor(n_estimators=num, random_state=13)
    est.fit(X_train, y_train)
    print(f"num estimators: {num}, MAE test: {metrics.mean_absolute_error(y_test, est.predict(X_test))}")

num estimators: 3, MAE test: 3.1393426760821277
num estimators: 10, MAE test: 2.747183053997201
num estimators: 100, MAE test: 2.551518287551235
num estimators: 500, MAE test: 2.5711019312737693
