In [12]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [13]:
data = pd.read_csv('C:/Users/ivanr/Downloads/lab-cross-validation/files_for_lab/Customer-Churn.csv')

In [14]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [15]:
y = data['Churn']
x = data.drop('Churn', axis=1)

In [16]:
numericals = x.select_dtypes(np.number)

In [17]:
categoricals = x.select_dtypes(np.object)

In [18]:
scaled = Normalizer().fit_transform(numericals)
scaled = pd.DataFrame(scaled)

In [19]:
encoded = OneHotEncoder(drop='first').fit_transform(categoricals).toarray()
encoded = pd.DataFrame(encoded)

In [20]:
features = pd.concat([scaled, encoded], axis = 1)

In [22]:
sm_features, sm_y  = SMOTE().fit_resample(features, y)

In [24]:
sm_y.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(sm_features, sm_y, test_size=0.2, random_state=40)

In [34]:
model1 = LogisticRegression()
model2 = DecisionTreeClassifier()

In [35]:
models = [model1, model2]

In [36]:
model_names = ['LC', 'DTC']

In [39]:
model_scores = {}

In [40]:
i = 0
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    model_scores[model_names[i]] = scores
    i =+ 1

In [41]:
model_scores

{'LC': array([0.80193237, 0.81582126, 0.81461353, 0.80241692, 0.80483384]),
 'DTC': array([0.81642512, 0.81582126, 0.82306763, 0.81389728, 0.80845921])}

In [42]:
tl = TomekLinks()
X_resampled, y_resampled = tl.fit_resample(features, y)

In [45]:
y_resampled.value_counts()

No     4598
Yes    1869
Name: Churn, dtype: int64

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=40)

In [47]:
i = 0
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    model_scores[model_names[i]] = scores
    i =+ 1

In [48]:
model_scores

{'LC': array([0.79806763, 0.81545894, 0.8057971 , 0.80851064, 0.80367505]),
 'DTC': array([0.7468599 , 0.76811594, 0.78164251, 0.78916828, 0.77369439])}

In [49]:
X2_resampled, y2_resampled = tl.fit_resample(X_resampled, y_resampled)

In [50]:
y2_resampled.value_counts()

No     4431
Yes    1869
Name: Churn, dtype: int64

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X2_resampled, y2_resampled, test_size=0.2, random_state=40)

In [52]:
i = 0
for model in models:
    scores = cross_val_score(model, X_train, y_train, cv = 5)
    model_scores[model_names[i]] = scores
    i =+ 1

In [53]:
model_scores

{'LC': array([0.80257937, 0.8234127 , 0.80357143, 0.79265873, 0.80555556]),
 'DTC': array([0.77083333, 0.80257937, 0.79761905, 0.76388889, 0.77380952])}