In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the wine-quality table and print its column dtypes / non-null counts.
df = pd.read_csv(filepath_or_buffer="../input/wine-quality-dataset/WineQT.csv")
df.info()

In [None]:
# Peek at the first five rows to sanity-check the load.
df.head(n=5)

In [None]:
# Annotated heatmap of the pairwise column correlations; the large canvas
# keeps the per-cell numbers legible.
fig, sx = plt.subplots(figsize=(20, 20))
sns.heatmap(data=df.corr(), annot=True, ax=sx)

In [None]:
# Correlation of every column with `quality`, weakest/most negative first.
df.corr().loc[:, 'quality'].sort_values(ascending=True)

#From the output above we can see that alcohol and sulphates have more impact on quality than the other columns, so we use these two columns to look for outliers

In [None]:
# Alcohol content against quality score, to eyeball outlying wines.
plt.scatter(df['alcohol'], df['quality'])
plt.xlabel(xlabel='alcohol')
plt.ylabel(ylabel='quality')

In [None]:
# Rows with quality of at least 5 and alcohol above 14.
df.loc[df['quality'].ge(5) & df['alcohol'].gt(14)]

In [None]:
# Sulphates level against quality score, to eyeball outlying wines.
plt.scatter(df['sulphates'], df['quality'])
plt.xlabel(xlabel='sulphates')
plt.ylabel(ylabel='quality')

In [None]:
# Rows with quality of at least 5 and sulphates above 1.75.
df.loc[df['quality'].ge(5) & df['sulphates'].gt(1.75)]

From the outputs above we can see that there are no outliers: we looked for a row index common to both filtered results, which would have marked an outlier.

In [None]:
# Number of missing values per column.
df.isnull().sum()

There are no null values in the data, so we will split the data into train and test sets.

In [None]:
from sklearn.linear_model import LinearRegression,Ridge,RidgeCV,Lasso,LassoCV,ElasticNet,ElasticNetCV
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.preprocessing import StandardScaler,MinMaxScaler,PolynomialFeatures
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
np.random.seed(10)

# Target column, and the feature matrix with the target and the row
# identifier removed.
y = df['quality']
x = df.drop(columns=['quality', 'Id'])

# Sweep polynomial degrees 1..6 with a plain linear regression and record the
# train / test RMSE for each degree. Despite their names, `train_preds` and
# `test_preds` hold RMSE values (one per degree), not predictions.
model = LinearRegression()
train_preds = []
test_preds = []
for degree in range(1, 7):
    poly = PolynomialFeatures(degree=degree, include_bias=False)
    converted_x = poly.fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(
        converted_x, y, test_size=0.3, random_state=10
    )
    model.fit(x_train, y_train)

    train_mse = mean_squared_error(y_train, model.predict(x_train))
    test_mse = mean_squared_error(y_test, model.predict(x_test))
    train_preds.append(np.sqrt(train_mse))
    test_preds.append(np.sqrt(test_mse))

In [None]:
# Plot the RMSE values against the polynomial degree itself rather than the
# 0-based list position: the sweep starts at degree 1, so entry k of each list
# belongs to degree k + 1. Without explicit x values the axis would read 0..5
# and every degree would appear shifted down by one.
degrees = range(1, len(train_preds) + 1)
plt.plot(degrees, train_preds, label='train_preds')
plt.plot(degrees, test_preds, label='test_preds')
plt.xlabel('polynomial degree')
plt.ylabel('RMSE')
plt.legend()

From the graph above we can see that at 4 and beyond the predictions go wrong, so we prefer a degree for the polynomial features that is less than 4.
We can choose 2.

In [None]:
np.random.seed(10)


# Features and target. `Id` is only a row identifier, so it is dropped along
# with the target -- the same feature set the degree sweep above was run on.
# Leaving it in would feed a meaningless column (and all of its polynomial
# interactions) to every model.
x=df.drop(['quality','Id'],axis=1)
y=df['quality']


# Expand the features with all degree-2 polynomial / interaction terms.
poly=PolynomialFeatures(degree=2,include_bias=False)
converted_x=poly.fit_transform(x)


# 70/30 train/test split with a fixed seed so the split is repeatable.
x_train,x_test,y_train,y_test=train_test_split(converted_x,y,test_size=0.3,random_state=10)

In [None]:
# Fit a range of regressors (linear, kernel, neighbour, tree and ensemble
# based) on the same train split and score each one on the test split, so the
# best-scoring model can be picked.
models = [
    ('RandomForest', RandomForestRegressor()),
    ('linear_rigression', LinearRegression()),
    ('Ridge', Ridge(alpha=10.0)),
    ('RidgeCV', RidgeCV(alphas=(0.1, 1.0, 10.0), scoring="neg_mean_squared_error")),
    ('Lasso', Lasso()),
    ('LassoCV', LassoCV()),
    ('ElasticNet', ElasticNet()),
    ('ElasticNetCV', ElasticNetCV()),
    ('SVR', SVR()),
    ('KNR', KNeighborsRegressor()),
    ('DTR', DecisionTreeRegressor()),
    ('BagingRegressor', BaggingRegressor()),
    ('AdaBoostregressor', AdaBoostRegressor()),
    ('XGBR', XGBRegressor()),
    ('LGBMR', LGBMRegressor()),
]

# One dict per metric, keyed by the model label.
models_score_MAE = {}
models_score_RMSE = {}
models_score_rscore = {}
for label, regressor in models:
    regressor.fit(x_train, y_train)
    y_preds = regressor.predict(x_test)
    models_score_RMSE[label] = np.sqrt(mean_squared_error(y_test, y_preds))
    models_score_MAE[label] = mean_absolute_error(y_test, y_preds)
    models_score_rscore[label] = r2_score(y_test, y_preds)

In [None]:
# Test-set RMSE per model label (lower is better).
models_score_RMSE

In [None]:
# Bar chart of test RMSE per model; labels are rotated so they do not overlap.
plt.bar(models_score_RMSE.keys(), models_score_RMSE.values())
plt.xticks(rotation=90)
plt.title(label='models_score_RMSE')

In [None]:
# Test-set mean absolute error per model label (lower is better).
models_score_MAE

In [None]:
# Bar chart of test MAE per model; labels are rotated so they do not overlap.
plt.bar(models_score_MAE.keys(), models_score_MAE.values())
plt.xticks(rotation=90)
plt.title(label='models_score_MAE')

In [None]:
# Test-set R^2 per model label (higher is better).
models_score_rscore

In [None]:
# Bar chart of test R^2 per model; labels are rotated so they do not overlap.
plt.bar(models_score_rscore.keys(), models_score_rscore.values())
plt.xticks(rotation=90)
plt.title(label='models_score_rscore')

From both models_score_MAE and models_score_RMSE we can see that RandomForest has low MAE and RMSE, so we prefer RandomForestRegressor()

In [None]:
# Rebuild the degree-2 feature matrix and the seeded 70/30 split, then fit the
# chosen random forest and report its R^2 on the held-out test split.
poly = PolynomialFeatures(degree=2, include_bias=False)
converted_x = poly.fit_transform(x)
x_train, x_test, y_train, y_test = train_test_split(
    converted_x, y, test_size=0.3, random_state=10
)

final_model = RandomForestRegressor()
final_model.fit(x_train, y_train)
y_preds = final_model.predict(x_test)
r2_score(y_test, y_preds)

We will try tuning the hyperparameters, first with RandomizedSearchCV and then with GridSearchCV

In [None]:
# Candidate hyper-parameter values that the randomized search samples from.
# `1.0` (consider every feature at each split) replaces the former "auto"
# option: for RandomForestRegressor "auto" meant exactly that, but the string
# was deprecated in scikit-learn 1.1 and removed in 1.3, where every sampled
# candidate using it fails to fit. The float form works on old and new versions.
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": [1.0, "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}


clf = RandomForestRegressor(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid, 
                            n_iter=10, # number of parameter combinations to try
                            cv=5,
                            verbose=2,
                            random_state=42, # set random_state to 42 for reproducibility
                            refit=True) # refit=True (default) refits the best model on all the data passed to fit()

# Fit the RandomizedSearchCV version of clf (on the training split only)
rs_clf.fit(x_train, y_train)

In [None]:
# Best hyper-parameter combination found by the randomized search.
rs_clf.best_params_

In [None]:
# Score of the refitted best estimator on the held-out test split.
rs_clf.score(X=x_test, y=y_test)

In [None]:
# Narrow grid for the exhaustive search.
# `max_depth` must be an int or None; the former 'sqrt' entry is only valid for
# `max_features`, so every candidate that used it failed to fit and half of the
# search was wasted. Only the valid setting is kept, which leaves the set of
# candidates that can actually be scored unchanged.
grid_2 = {'n_estimators': [1200, 1150, 1250],
          'max_depth': [None],
          'max_features': ['sqrt'],
          'min_samples_split': [2],
          'min_samples_leaf': [2, 4]}

# Instantiate RandomForestRegressor
clf = RandomForestRegressor(n_jobs=1)

# Setup GridSearchCV
gs_clf = GridSearchCV(estimator=clf,
                      param_grid=grid_2, 
                      cv=5,
                      verbose=2,
                      refit=True) # refit=True (default) refits the best model on all the data passed to fit()

# Fit the GridSearchCV version of clf (on the training split only)
gs_clf.fit(x_train, y_train)
gs_clf.best_params_