![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab | Random Forests

For this lab, you will be using the CSV files provided in the `files_for_lab` folder.

### Instructions

- Apply the Random Forests algorithm but this time only by upscaling the data using `SMOTE`.
- Note that since `SMOTE` works on numerical data only, we will first encode the categorical variables in this case.

<h1 style="color: #00BFFF;">00 |</h1>

In [31]:
# 📚 Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # 2D visualizations
import warnings # warning messages management

# 🤖 Machine Learning
from sklearn.preprocessing import MinMaxScaler # data scaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split # train/test sets
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, confusion_matrix, ConfusionMatrixDisplay # roc curve, confussion matrix & displayer

In [2]:
# ⚙️ Settings
pd.set_option('display.max_columns', None) # display all columns
warnings.filterwarnings('ignore') # ignore warnings

In [3]:
# 🔧 Basic functions
def explore_data(data): # sum & returns duplicates, NaN & empty spaces
    duplicate_rows = data.duplicated().sum()
    nan_values = data.isna().sum()
    empty_spaces = data.eq(' ').sum()
    import pandas as pd
    exploration = pd.DataFrame({"NaN": nan_values, "EmptySpaces": empty_spaces}) # New dataframe with the results
    print(f"There are {data.duplicated().sum()} duplicate rows. Also;")
    return exploration

<h1 style="color: #00BFFF;">01 | Data Extraction</h1>

In [4]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head(5) # call of the churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


<h1 style="color: #00BFFF;">02 | EDA</h1>

In [5]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

<h1 style="color: #00BFFF;">03 | Data Cleaning</h1>

In [6]:
churnData.duplicated().sum()

49

In [7]:
# Let's drop the duplicates
churnData = churnData.drop_duplicates()

In [8]:
churnData.duplicated().sum()

0

In [9]:
# So... apart from nulls, duplicates, what else we can check?
churnData.eq(' ').sum() # empty spaces ofc, eq = an empty space & sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [10]:
# Muahahaha, there are ofc some empty spaces we are not gonna miss

In [11]:
churnData['TotalCharges'] = churnData['TotalCharges'].replace(' ', np.nan)  # replacing it with NaN 
churnData = churnData.dropna(subset=['TotalCharges']) # and dropping them like NaN

In [12]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

<h1 style="color: #00BFFF;">04 | Data Pre-Processing</h1>

In [13]:
churnData["Churn"] = churnData["Churn"].apply(lambda x: 1 if x == "Yes" else 0)

In [21]:
X = churnData[['tenure', 'SeniorCitizen', 'MonthlyCharges', 'TotalCharges']] # Selecting the features from the lab
y = churnData['Churn'] # Is our customer churn or not?

In [22]:
# StandardScaler dosen't make any sense, since it's not normal distributed.
# We will use minmax, even to seniorcitizen (it will not affect the model)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [27]:
# Apply SMOTE
smote = SMOTE()

In [28]:
X_sm, y_sm = smote.fit_resample(X_scaled, y)

In [29]:
# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=42)

In [32]:
# Apply the random forest classifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

<h1 style="color: #00BFFF;">05 | Predictions</h1>

In [33]:
y_pred = rf.predict(X_test)

In [34]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.82      0.75      0.78      1598
           1       0.76      0.82      0.79      1482

    accuracy                           0.79      3080
   macro avg       0.79      0.79      0.79      3080
weighted avg       0.79      0.79      0.79      3080



In [35]:
# Compared to previous labs, RandomForestClassifier gives the best results !

In [36]:
# Sorry for lazy and quick responses... I wanted to catch up with past labs, now I can focus on recordings and notes again
# I will definitely use this knowledge for future models