## 00 | Setup

In [39]:
# [Basic Libraries]
import os # file management 
import pandas as pd # data manipulation
import numpy as np # numerical operations

# [Machine Learning]
from sklearn.preprocessing import StandardScaler # data normalization
from sklearn.model_selection import train_test_split # train/test sets
from sklearn.linear_model import LogisticRegression # logistic model
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, cohen_kappa_score # confusion matrix metric & display, Cohen's kappa score

# [New!] How to deal with imbalanced data?
from imblearn.under_sampling import RandomUnderSampler # random undersampling technique
from imblearn.over_sampling import RandomOverSampler # random oversampling technique
from imblearn.under_sampling import TomekLinks # undersampling technique: Tomek links
from imblearn.over_sampling import SMOTE # oversampling technique: SMOTE
from sklearn.metrics import classification_report # metrics report to check & compare our scores

# [Settings]
import warnings
warnings.filterwarnings('ignore') # silence all warnings (this also hides deprecation notices)
pd.set_option('display.max_columns', None) # display all columns

In [7]:
# Basic functions
def data_info(data):
    """Print a quick overview of a DataFrame and return a few random rows.

    Prints the shape and the per-column dtypes, then returns a random sample
    so that the notebook renders it as the cell output.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to summarize.

    Returns
    -------
    pandas.DataFrame
        Five randomly sampled rows, or every row (shuffled) when the frame
        holds fewer than five.
    """
    print(f"Data shape is {data.shape}.")
    print()
    print(data.dtypes)
    print()
    print("Data row sample and full columns:")
    # sample() raises ValueError when asked for more rows than exist,
    # so cap the sample size at the number of available rows.
    return data.sample(min(5, len(data)))

def clean_columns(data):
    """Standardize the column names of ``data`` in place.

    Every label is converted to a string, lower-cased, and has its spaces
    replaced by underscores. The frame passed in is mutated.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are renamed.

    Returns
    -------
    pandas.DataFrame
        A zero-row frame carrying the renamed columns, so the notebook shows
        only the new header.
    """
    # str() keeps non-string labels (e.g. integer column names) from raising
    # AttributeError on .lower().
    data.columns = [str(label).lower().replace(' ', '_') for label in data.columns]
    return data.sample(0)

## 01 | Data Extraction

In [9]:
# Directory holding the lab's CSV files. It can be overridden through the
# CHURN_DATA_DIR environment variable so the notebook also runs on machines
# other than the author's; the original location remains the default.
data_dir = os.environ.get(
    "CHURN_DATA_DIR",
    "C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_3_sql/lab-imbalanced-data/data",
)
file_path = os.path.join(data_dir, "customer_churn.csv")
data = pd.read_csv(file_path)
data_info(data)

Data shape is (7043, 21).

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

Data row sample and full columns:


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4886,2673-ZALNP,Female,0,No,No,7,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Mailed check,19.9,173.15,No
6745,5515-AKOAJ,Female,0,No,No,54,Yes,Yes,Fiber optic,No,No,No,Yes,Yes,Yes,Month-to-month,Yes,Electronic check,100.1,5440.9,Yes
2481,6614-YWYSC,Male,1,Yes,No,61,Yes,Yes,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),25.0,1501.75,No
98,3212-KXOCR,Male,0,No,No,52,Yes,No,No,No internet service,No internet service,No internet service,No internet service,No internet service,No internet service,Two year,No,Bank transfer (automatic),21.0,1107.2,No
5389,9701-CDXHR,Female,0,Yes,No,51,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,No,Mailed check,69.15,3649.6,No


<div class="alert alert-block alert-info">
    
**Binary Classification Problem**
    
We want to predict `churn` (i.e., has the customer stopped doing business with us?) as **yes** or **no**, using `seniorcitizen`, `tenure`, and `monthlycharges` as features (we will pick them by selecting only the numerical columns).
    
The final **goal** of this lab is to learn how to deal with imbalanced data. So... let's walk through it in our own shoes!
</div>

## 02 | Data Wrangling

In [15]:
# Good practices, not necessary for a lab
# c_data = data.copy()

In [16]:
clean_columns(data) # standardize the column names (renames the columns of `data` in place)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn


In [17]:
# How is our target distributed? Count the rows for each `churn` label.
data['churn'].value_counts()

No     5174
Yes    1869
Name: churn, dtype: int64

<blockquote style="background-color: #ffeeba; color: #856404; border-color: #ffeeba; padding: 10px; border-radius: 5px;">

There is a large imbalance in how the two categories are represented.
</blockquote>

In [13]:
# Share (in %) of the majority class, computed from the data rather than from
# counts copied by hand, so it stays correct if the dataset changes.
float(data['churn'].value_counts(normalize=True).max() * 100)

73.4630129206304

<blockquote style="background-color: #d4edda; color: #155724; border-color: #c3e6cb; padding: 10px; border-radius: 5px;">
    
The majority class (`No`) accounts for about **73%** of the observations.
    
</blockquote>

### Selecting Numericals

In [14]:
# Selecting our features: keep only the numeric columns.
# (`np.object` was deprecated in NumPy 1.20 and removed in 1.24, so
# `exclude=np.object` raises AttributeError on current NumPy versions.)
n = data.select_dtypes(include="number")
n.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


## 03 | Data Processing

### Normalizing the data

In [19]:
# Standardize every numeric feature with StandardScaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics leak into the transformation —
# consider fitting on the training rows only.
sd = StandardScaler()
sd.fit(n)

# Scale the features and wrap the resulting array in a DataFrame that keeps
# the original column names.
n_scaled = sd.transform(n)
ns = pd.DataFrame(n_scaled, columns=n.columns)

ns.head()

Unnamed: 0,seniorcitizen,tenure,monthlycharges
0,-0.439916,-1.277445,-1.160323
1,-0.439916,0.066327,-0.259629
2,-0.439916,-1.236724,-0.36266
3,-0.439916,0.514251,-0.746535
4,-0.439916,-1.236724,0.197365


### X-Y Split

In [20]:
# Features are the scaled numeric columns; the target is the churn label.
# (Be careful about when and how the split is done.)
X, Y = ns, data['churn']

In [21]:
# Hold out 30% of the rows for testing; the remaining 70% are used for training.
# A fixed random_state makes the shuffle reproducible, so every run produces
# the same split (it does not change the quality of the model).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [22]:
len(X) # Total number of rows before the train/test split

7043

In [23]:
len(X_test) # Rows in the test set (30% of X)

2113

In [24]:
len(X_train) # Rows in the training set (70% of X)

4930

### Logistic Regression

In [25]:
# Fit a logistic regression classifier on the training split
logistic = (
    LogisticRegression(random_state=0)
    .fit(X_train, y_train)
)

In [29]:
# Predicted churn labels for the held-out test rows
predictions = logistic.predict(X_test)

In [30]:
# Mean accuracy of the classifier on the held-out test set
logistic.score(X_test, y_test)

0.7936583057264552

<blockquote style="background-color: #ffeeba; color: #856404; border-color: #ffeeba; padding: 10px; border-radius: 5px;">

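Even with the imbalance, we got 79% accuracy. Keep in mind that always predicting `No` would already score about 73%, so accuracy alone is a misleading metric here.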
</blockquote>

In [31]:
# Per-class precision, recall and F1 on the test set, kept in `logistic_1`
# and printed below
logistic_1 = classification_report(y_test, predictions)
print(logistic_1)

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1539
         Yes       0.68      0.45      0.54       574

    accuracy                           0.79      2113
   macro avg       0.75      0.69      0.70      2113
weighted avg       0.78      0.79      0.78      2113



![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png)

# Lab-Time | Dealing with imbalanced data

#### For this lab, we will use different Python scripts using `os`, just to play around and for a better organization.

In [32]:
# Wrap the cleaned frame under the name `Cdata`.
# NOTE(review): the export cell that follows writes `data`, not `Cdata`, so
# this variable looks unused within this notebook — confirm whether it is needed.
Cdata = pd.DataFrame(data)

In [38]:
# Export the frame with standardized column names for the follow-up scripts.
# NOTE(review): to_csv writes the row index by default, so the file gains an
# unnamed leading column; pass index=False if downstream readers do not expect it.
# NOTE(review): absolute, user-specific path — this will not run on other machines.
data.to_csv('C:/Users/apisi/01. IronData/01. GitHub/01. IronLabs/unit_3_sql/lab-imbalanced-data/data/Cdata.csv')

#### We created a new file `Cdata.csv` with all our clean (standardized) data from `customer_churn.csv`. We will use it for all our imbalance practices.