In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
housing_df = pd.read_csv('boston_housing.csv')

In [3]:
housing_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,,36.2


In [4]:
housing_df = housing_df.dropna()

In [5]:
X = housing_df.iloc[:, :-1]
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10
500,0.22438,0.0,9.69,0.0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64


In [6]:
y = housing_df.iloc[:, -1]
y

0      24.0
1      21.6
2      34.7
3      33.4
5      28.7
       ... 
499    17.5
500    16.8
502    20.6
503    23.9
504    22.0
Name: MEDV, Length: 394, dtype: float64

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
reg = LinearRegression()

In [9]:
reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [10]:
y_pred = reg.predict(X_test)

In [11]:
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 5.419428575501432


In [12]:
def regression_model(model):
    # Create training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Create the regressor
    reg_all = model
    
    # Fit the regressor to the training data
    reg_all.fit(X_train, y_train)
    
    # Predict on the test data
    y_pred = reg_all.predict(X_test)
    
    # Compute and print RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"Root Mean Squared Error: {rmse}")

In [13]:
regression_model(LinearRegression())

Root Mean Squared Error: 4.507900730070378


In [14]:
regression_model(LinearRegression())

Root Mean Squared Error: 4.571394099527376


In [15]:
regression_model(LinearRegression())

Root Mean Squared Error: 5.118675038397321


In [16]:
# More accurate than just regression model by including cross validation

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
def regression_model_cv(model, k=5):
    score = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=k)
    rmse = np.sqrt(-score)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean())

In [19]:
regression_model_cv(LinearRegression())

Reg rmse: [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]
Reg mean: 5.337868962878373


In [20]:
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 3.72504914  6.01655701 23.20863933]
Reg mean: 10.983415161090695


In [21]:
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]
Reg mean: 5.08659081080109


In [22]:
# Ridge

In [23]:
from sklearn.linear_model import Ridge

In [24]:
regression_model_cv(Ridge())

Reg rmse: [3.17202127 4.54972372 5.36604368 8.03715216 5.03988501]
Reg mean: 5.232965166251768


In [25]:
# Lasso

In [26]:
from sklearn.linear_model import Lasso

In [27]:
regression_model_cv(Lasso())

Reg rmse: [3.52318747 5.70083491 7.82318757 6.9878025  3.97229348]
Reg mean: 5.60146118538429


In [28]:
# KNN

In [29]:
from sklearn.neighbors import KNeighborsRegressor

In [30]:
regression_model_cv(KNeighborsRegressor())

Reg rmse: [ 8.24568226  8.81322798 10.58043836  8.85643441  5.98100069]
Reg mean: 8.495356738515685


In [31]:
regression_model_cv(KNeighborsRegressor(n_neighbors=4))

Reg rmse: [ 8.44659788  8.99814547 10.97170231  8.86647969  5.72114135]
Reg mean: 8.600813339223432


In [32]:
regression_model_cv(KNeighborsRegressor(n_neighbors=7))

Reg rmse: [ 7.99710601  8.68309183 10.66332898  8.90261573  5.51032355]
Reg mean: 8.351293217401393


In [33]:
regression_model_cv(KNeighborsRegressor(n_neighbors=10))

Reg rmse: [ 7.47549287  8.62914556 10.69543822  8.91330686  6.52982222]
Reg mean: 8.448641147609868


In [34]:
# GridSearchCV

In [35]:
from sklearn.model_selection import GridSearchCV

In [36]:
neighbors = np.linspace(1, 20, 20)

In [37]:
k = neighbors.astype(int)

In [38]:
param_grid = {'n_neighbors': k}

In [39]:
knn = KNeighborsRegressor()

In [40]:
knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

In [41]:
knn_tuned.fit(X, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [42]:
k = knn_tuned.best_params_
print(f"Best n_neighbors: {k}")

score = knn_tuned.best_score_
rsm = np.sqrt(-score)
print(f"Best score: {rsm}")

Best n_neighbors: {'n_neighbors': 7}
Best score: 8.516767055977628


In [43]:
# Decision Tree

In [44]:
from sklearn import tree

In [45]:
regression_model_cv(tree.DecisionTreeRegressor(random_state=0))

Reg rmse: [3.76418835 7.09740548 7.43743525 6.65318045 5.60443963]
Reg mean: 6.111329831868544


In [46]:
# Random Forest

In [47]:
from sklearn.ensemble import RandomForestRegressor

In [48]:
regression_model_cv(RandomForestRegressor(random_state=0))

Reg rmse: [3.21572317 3.72039739 4.92919054 6.6140911  3.76347521]
Reg mean: 4.448575483510956


In [49]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state=0))

Reg rmse: [3.21572317 3.72039739 4.92919054 6.6140911  3.76347521]
Reg mean: 4.448575483510956


In [50]:
# Randomized Search CV

In [51]:
from sklearn.model_selection import RandomizedSearchCV

In [52]:
param_grid = {
    'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['auto', 'sqrt']
}

In [53]:
reg = RandomForestRegressor(n_jobs=-1, random_state=0)

In [54]:
reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error')

In [55]:
reg_tuned.fit(X, y)

RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100, n_jobs=-1,
                   

In [56]:
p = reg_tuned.best_params_
print(f'Best n_neighbors: {p}')

score = reg_tuned.best_score_
rsm = np.sqrt(-score)
print(f'Best score: {rsm}')

Best n_neighbors: {'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': None}
Best score: 4.632046143933888


In [57]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500, random_state=0))

Reg rmse: [3.18067908 3.73897569 4.83724261 6.50199561 3.92786755]
Reg mean: 4.4373521050792615


In [58]:
# Import, prepare and clean data

In [59]:
import numpy as np
import pandas as pd

In [60]:
col_names = ['Mean of integrated profile', 'Standard deviation of integrated profile', 
               'Excess kurtosis of integrated profile', 'Skewness of integrated profile',
               'Mean of DM-SNR curve', 'Standard deviation of DM-SNR curve',
               'Excess kurtosis of DM-SNR curve', 'Skewness of DM-SNR curve', 'Class' ]

df = pd.read_csv('HTRU_2.csv', header=None, names=col_names)

# df.columns = [['Mean of integrated profile', 'Standard deviation of integrated profile', 
#                'Excess kurtosis of integrated profile', 'Skewness of integrated profile',
#                'Mean of DM-SNR curve', 'Standard deviation of DM-SNR curve',
#                'Excess kurtosis of DM-SNR curve', 'Skewness of DM-SNR curve', 'Class' ]]

In [61]:
df.head()

Unnamed: 0,Mean of integrated profile,Standard deviation of integrated profile,Excess kurtosis of integrated profile,Skewness of integrated profile,Mean of DM-SNR curve,Standard deviation of DM-SNR curve,Excess kurtosis of DM-SNR curve,Skewness of DM-SNR curve,Class
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [62]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   Mean of integrated profile                17898 non-null  float64
 1   Standard deviation of integrated profile  17898 non-null  float64
 2   Excess kurtosis of integrated profile     17898 non-null  float64
 3   Skewness of integrated profile            17898 non-null  float64
 4   Mean of DM-SNR curve                      17898 non-null  float64
 5   Standard deviation of DM-SNR curve        17898 non-null  float64
 6   Excess kurtosis of DM-SNR curve           17898 non-null  float64
 7   Skewness of DM-SNR curve                  17898 non-null  float64
 8   Class                                     17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [63]:
len(df)

17898

In [64]:
# Logistic regression

In [65]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [66]:
X = df.iloc[:, 0:8]
X

Unnamed: 0,Mean of integrated profile,Standard deviation of integrated profile,Excess kurtosis of integrated profile,Skewness of integrated profile,Mean of DM-SNR curve,Standard deviation of DM-SNR curve,Excess kurtosis of DM-SNR curve,Skewness of DM-SNR curve
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306
...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910


In [67]:
y = df.iloc[:, 8]
y

0        0
1        0
2        0
3        0
4        0
        ..
17893    0
17894    0
17895    0
17896    0
17897    0
Name: Class, Length: 17898, dtype: int64

In [68]:
def clf_model(model):
    clf = model
    scores = cross_val_score(clf, X, y)
    
    print('Scores:', scores)
    print('Mean score:', scores.mean())

In [69]:
clf_model(LogisticRegression(random_state=0))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Scores: [0.97458101 0.97988827 0.98100559 0.97736798 0.9782062 ]
Mean score: 0.9782098086135604


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [70]:
# Classifiers

In [71]:
# GaussianNB
from sklearn.naive_bayes import GaussianNB

In [72]:
clf_model(GaussianNB())

Scores: [0.96061453 0.92374302 0.94273743 0.92847164 0.96451523]
Mean score: 0.9440163679814436


In [73]:
# KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

In [74]:
clf_model(KNeighborsClassifier())

Scores: [0.96955307 0.96927374 0.97318436 0.9706622  0.97289746]
Mean score: 0.9711141653437728


In [75]:
# DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

In [76]:
clf_model(DecisionTreeClassifier(random_state=0))

Scores: [0.96843575 0.96424581 0.96871508 0.96227997 0.96954457]
Mean score: 0.9666442360073738


In [77]:
# RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [78]:
clf_model(RandomForestClassifier(random_state=0))

Scores: [0.97709497 0.98324022 0.98072626 0.97485331 0.97848561]
Mean score: 0.978880074800083


In [79]:
# Finding pulsar percentage from Dataset

In [80]:
df['Class'].count()

17898

In [81]:
df[df.Class == 1].Class.count()

1639

In [82]:
df[df.Class == 1].Class.count()/df.Class.count()

0.09157447759526204

In [83]:
# Confusion matrix with classifiation report

In [84]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)

In [86]:
def confusion(model):
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification report:', classification_report(y_test, y_pred))
    
    return clf

In [87]:
confusion(LogisticRegression())

Confusion Matrix: [[4049   29]
 [  73  324]]
Classification report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4078
           1       0.92      0.82      0.86       397

    accuracy                           0.98      4475
   macro avg       0.95      0.90      0.93      4475
weighted avg       0.98      0.98      0.98      4475



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [88]:
confusion(KNeighborsClassifier())

Confusion Matrix: [[4034   44]
 [  88  309]]
Classification report:               precision    recall  f1-score   support

           0       0.98      0.99      0.98      4078
           1       0.88      0.78      0.82       397

    accuracy                           0.97      4475
   macro avg       0.93      0.88      0.90      4475
weighted avg       0.97      0.97      0.97      4475



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [89]:
confusion(GaussianNB())

Confusion Matrix: [[3884  194]
 [  63  334]]
Classification report:               precision    recall  f1-score   support

           0       0.98      0.95      0.97      4078
           1       0.63      0.84      0.72       397

    accuracy                           0.94      4475
   macro avg       0.81      0.90      0.85      4475
weighted avg       0.95      0.94      0.95      4475



GaussianNB(priors=None, var_smoothing=1e-09)

In [90]:
confusion(RandomForestClassifier(random_state=0))

Confusion Matrix: [[4051   27]
 [  69  328]]
Classification report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4078
           1       0.92      0.83      0.87       397

    accuracy                           0.98      4475
   macro avg       0.95      0.91      0.93      4475
weighted avg       0.98      0.98      0.98      4475



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [91]:
from sklearn.ensemble import AdaBoostClassifier

In [92]:
clf_model(AdaBoostClassifier())

Scores: [0.97430168 0.97988827 0.98128492 0.97597094 0.97708857]
Mean score: 0.977706874833175


In [93]:
confusion(AdaBoostClassifier())

Confusion Matrix: [[4046   32]
 [  74  323]]
Classification report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4078
           1       0.91      0.81      0.86       397

    accuracy                           0.98      4475
   macro avg       0.95      0.90      0.92      4475
weighted avg       0.98      0.98      0.98      4475



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [94]:
X = housing_df.iloc[:, :-1]
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10
500,0.22438,0.0,9.69,0.0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64


In [95]:
y = housing_df.iloc[:, -1]
y

0      24.0
1      21.6
2      34.7
3      33.4
5      28.7
       ... 
499    17.5
500    16.8
502    20.6
503    23.9
504    22.0
Name: MEDV, Length: 394, dtype: float64

In [96]:
from sklearn.ensemble import AdaBoostRegressor

In [97]:
regression_model_cv(AdaBoostRegressor())

Reg rmse: [3.63983494 3.95888095 5.84190876 6.46211371 4.2041783 ]
Reg mean: 4.821383333287069


### Activity 25

In [98]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [99]:
df = pd.read_csv('CHURN.csv')
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [100]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [101]:
df.isna().any()

customerID          False
gender              False
SeniorCitizen       False
Partner             False
Dependents          False
tenure              False
PhoneService        False
MultipleLines       False
InternetService     False
OnlineSecurity      False
OnlineBackup        False
DeviceProtection    False
TechSupport         False
StreamingTV         False
StreamingMovies     False
Contract            False
PaperlessBilling    False
PaymentMethod       False
MonthlyCharges      False
TotalCharges        False
Churn               False
dtype: bool

In [102]:
df['Churn'] = df['Churn'].replace({'Yes':1, 'No':0})
# df['Churn'] = df['Churn'].replace(to_replace=['No', 'Yes'], value=[0, 1])

In [103]:
df

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,0
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,0
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,1
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,0
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,0
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,0
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,0
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,1


In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [105]:
X = df.iloc[:, 1:-1]
X

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5
2,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15
3,Male,0,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75
4,Female,0,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5
7039,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9
7040,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45
7041,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6


In [106]:
y = df.iloc[:, -1]
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [107]:
X = pd.get_dummies(X)
X

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,...,TotalCharges_995.35,TotalCharges_996.45,TotalCharges_996.85,TotalCharges_996.95,TotalCharges_997.65,TotalCharges_997.75,TotalCharges_998.1,TotalCharges_999.45,TotalCharges_999.8,TotalCharges_999.9
0,0,1,29.85,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,34,56.95,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2,53.85,0,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,45,42.30,0,1,1,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,2,70.70,1,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7039,0,72,103.20,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7040,0,11,29.60,1,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
7041,1,4,74.40,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
def clf_model(model, cv=3):
    clf = model
    
    scores = cross_val_score(clf, X, y, cv=cv)
    
    print('Scores:', scores)
    print('Mean score', scores.mean())

In [109]:
clf_model(LogisticRegression())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Scores: [0.79812606 0.80025554 0.80741372]
Mean score 0.8019317736683194


In [110]:
clf_model(KNeighborsClassifier())

Scores: [0.77938671 0.76320273 0.77290158]
Mean score 0.7718303381000114


In [111]:
clf_model(GaussianNB())

Scores: [0.27725724 0.28109029 0.27652322]
Mean score 0.2782902503153228


In [112]:
clf_model(RandomForestClassifier())

Scores: [0.78577513 0.78662692 0.78696208]
Mean score 0.7864547078477072


In [113]:
clf_model(AdaBoostClassifier())

Scores: [0.80366269 0.80451448 0.80059651]
Mean score 0.8029245594131428


In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [115]:
def confusion(model):
    clf = model
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    print('Confusion matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    
    return clf

In [116]:
confusion(AdaBoostClassifier())

Confusion matrix: [[1173  130]
 [ 224  234]]
Classification Report:               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1303
           1       0.64      0.51      0.57       458

    accuracy                           0.80      1761
   macro avg       0.74      0.71      0.72      1761
weighted avg       0.79      0.80      0.79      1761



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [117]:
confusion(RandomForestClassifier())

Confusion matrix: [[1185  118]
 [ 253  205]]
Classification Report:               precision    recall  f1-score   support

           0       0.82      0.91      0.86      1303
           1       0.63      0.45      0.52       458

    accuracy                           0.79      1761
   macro avg       0.73      0.68      0.69      1761
weighted avg       0.77      0.79      0.78      1761



RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [118]:
confusion(LogisticRegression())

Confusion matrix: [[1159  144]
 [ 229  229]]
Classification Report:               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1303
           1       0.61      0.50      0.55       458

    accuracy                           0.79      1761
   macro avg       0.72      0.69      0.71      1761
weighted avg       0.78      0.79      0.78      1761



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [119]:
confusion(AdaBoostClassifier(n_estimators=250))

Confusion matrix: [[1176  127]
 [ 229  229]]
Classification Report:               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1303
           1       0.64      0.50      0.56       458

    accuracy                           0.80      1761
   macro avg       0.74      0.70      0.72      1761
weighted avg       0.79      0.80      0.79      1761



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=250, random_state=None)

In [120]:
confusion(AdaBoostClassifier(n_estimators=25))

Confusion matrix: [[1171  132]
 [ 227  231]]
Classification Report:               precision    recall  f1-score   support

           0       0.84      0.90      0.87      1303
           1       0.64      0.50      0.56       458

    accuracy                           0.80      1761
   macro avg       0.74      0.70      0.71      1761
weighted avg       0.79      0.80      0.79      1761



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=25, random_state=None)

In [121]:
confusion(AdaBoostClassifier(n_estimators=15))

Confusion matrix: [[1187  116]
 [ 245  213]]
Classification Report:               precision    recall  f1-score   support

           0       0.83      0.91      0.87      1303
           1       0.65      0.47      0.54       458

    accuracy                           0.80      1761
   macro avg       0.74      0.69      0.70      1761
weighted avg       0.78      0.80      0.78      1761



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=15, random_state=None)