## 00. Coding Best Practices

In [44]:
# Basic libraries
import pandas as pd # data manipulatioN
import numpy as np # numerical operations
import matplotlib.pyplot as plt # 2D visualizations
import seaborn as sns # high-resolution visualization
import warnings # warning messages management

# SQL connection
from sqlalchemy import create_engine # let's connect to SQL
from getpass import getpass

# Machine Learning
from sklearn.preprocessing import LabelEncoder # label encoding
from sklearn.model_selection import train_test_split # splitting data into train/test sets
from sklearn.linear_model import LogisticRegression # logistic model

# Settings
pd.options.display.max_columns = None # never truncate columns when a DataFrame is displayed
warnings.filterwarnings(action='ignore') # silence every warning for the rest of the session

In [40]:
# Basic functions
def data_info(df=None, n_rows=5): # improved data.info()
    """Print the shape of a DataFrame and return a random sample of its rows.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to describe. When omitted, the notebook-level ``data`` frame is
        used, so existing ``data_info()`` calls keep working.
    n_rows : int, default 5
        Maximum number of rows to sample. It is capped at the number of rows
        in the frame, so frames shorter than ``n_rows`` do not raise.

    Returns
    -------
    pandas.DataFrame
        A random sample of at most ``n_rows`` rows with every column.
    """
    if df is None:
        df = data # fall back to the notebook-level frame loaded from SQL
    print(f"The DataFrame shape is {df.shape}.")
    print()
    print("DataFrame random row sample and full columns:")
    # sample() raises when asked for more rows than exist (without replacement)
    return df.sample(min(n_rows, len(df)))

## 01 - Getting the Data

In [41]:
# Creating a safe connection
# Prompt for the password first: the connection string is built from it, so asking
# afterwards raises a NameError on a fresh kernel (Restart & Run All).
password = getpass("Please, kindly insert your password:")
# NOTE(review): a password containing characters such as '@' or '/' would need to be
# URL-escaped before being concatenated into the URL - confirm whether this applies.
connection_string = 'mysql+pymysql://root:' + password + '@localhost/sakila'
engine = create_engine(connection_string)

Please, kindly insert your password:········


In [42]:
# Getting the data
# Run the query through the SQLAlchemy engine and load the result into a DataFrame
data = pd.read_sql_query(sql='SELECT * FROM logistic_data;', con=engine)
data.sample(5) # random five-row preview

Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals
253,178,6,0.99,115,G,"Deleted Scenes,Behind the Scenes",Comedy,6.0
569,900,6,2.99,136,R,Behind the Scenes,Foreign,14.0
347,650,3,0.99,144,G,"Trailers,Commentaries,Deleted Scenes,Behind th...",Documentary,24.0
366,945,5,0.99,164,R,Deleted Scenes,Documentary,28.0
205,341,5,0.99,82,PG,"Trailers,Deleted Scenes",Classics,29.0


In [43]:
# Print the shape of `data` and display five randomly sampled rows
data_info()

The DataFrame shape is (1000, 8).

DataFrame random row sample and full columns:


Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals
998,988,7,2.99,139,R,"Trailers,Commentaries,Behind the Scenes",Travel,15.0
702,133,7,4.99,117,NC-17,Trailers,Music,12.0
169,688,6,0.99,61,PG,"Trailers,Commentaries,Deleted Scenes,Behind th...",Children,18.0
390,272,5,4.99,153,NC-17,Deleted Scenes,Drama,15.0
122,886,5,0.99,184,PG-13,"Deleted Scenes,Behind the Scenes",Animation,10.0


<div class="alert alert-block alert-info">
    
**First impression:**
    
_____________

The following dataset holds information on 1,000 films across 8 different columns, collected over a period of **less than one year** (from 2005/05/24 to 2006/02/14).
    
Our **binary target** will be the probability of a movie being rented again in the future (low or high) --> We will therefore classify movies into low/high (binary) rental demand by analyzing the total number of rentals for each film within the time period (`n_rentals`).
    
**Firstly**, data cleaning:
_____________
</div>

## 02 - Cleaning the Data

In [51]:
# Work on an independent (deep) copy so the frame loaded from SQL stays untouched
data_copy = data.copy(deep=True)

### Encoding
<div class="alert alert-block alert-info">
    
* We imported features such as `rating`, `special_features` and `name` as categoricals; we will use `LabelEncoder` to encode them as numericals:
</div>

In [54]:
# Columns that were imported as categoricals and need a numerical representation
categorical_cols = ['rating', 'special_features', 'name']

# Fit one LabelEncoder per column and keep each of them: re-fitting a single shared
# encoder overwrites its learned classes, so only the last column could be decoded
# back to labels with inverse_transform
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    data_copy[col + '_ecd'] = encoders[col].fit_transform(data_copy[col])

# Kept for backward compatibility: `le` ends up fitted on `name`, exactly as before
le = encoders['name']

# Visualizing the changes: encoded columns side by side with the original labels
data_encoded_f = data_copy[['rating_ecd', 'special_features_ecd', 'name_ecd']]
data_comparision_2 = pd.concat([data_encoded_f, data_copy[categorical_cols]], axis=1)
data_comparision_2.sample(5)

Unnamed: 0,rating_ecd,special_features_ecd,name_ecd,rating,special_features,name
859,3,6,13,PG-13,"Deleted Scenes,Behind the Scenes",Sci-Fi
361,3,8,5,PG-13,"Trailers,Behind the Scenes",Documentary
735,3,13,11,PG-13,"Trailers,Deleted Scenes",Music
642,2,6,10,PG,"Deleted Scenes,Behind the Scenes",Horror
537,1,9,8,NC-17,"Trailers,Commentaries",Foreign


### Selecting numericals / Dealing with Null values

<div class="alert alert-block alert-info">
    
* We will now select all numericals from `data_copy` to drop the categoricals.
* Also, we will use `.fillna(0)` to deal with NaN values from the LEFT JOIN with `n_rentals` (explained in [SQL database-extraction](https://github.com/isi-mube/iron-labs/blob/main/unit_3_sql/lab-predictions-logistic-regression/notebook/sql_database_extraction_process.sql))
</div>

In [55]:
# Build the numerical-only frame from data_copy
X_N = (
    data_copy
    .select_dtypes(include=np.number) # keep only the numeric columns
    .fillna(0) # replace every NaN with 0
)
X_N.head(10)

Unnamed: 0,film_id,rental_duration,rental_rate,length,n_rentals,rating_ecd,special_features_ecd,name_ecd
0,19,6,0.99,113,20.0,2,4,0
1,21,3,4.99,129,21.0,4,2,0
2,29,5,2.99,168,10.0,1,11,0
3,38,6,0.99,68,0.0,1,12,0
4,56,6,2.99,129,18.0,0,9,0
5,67,5,2.99,77,20.0,3,5,0
6,97,7,0.99,56,19.0,0,10,0
7,105,6,0.99,125,15.0,1,5,0
8,111,3,0.99,52,16.0,1,3,0
9,115,5,2.99,167,19.0,4,0,0


![image.png](attachment:image.png)

### Creating the target

<div class="alert alert-block alert-info">
    
* Further explanation
</div>

In [61]:
# Collect the column names of X_N so they can be checked before moving the target to the end
data_headers = list(X_N.columns)
print("The Column Headers are :", data_headers)

The Column Headers are : ['film_id', 'rental_duration', 'rental_rate', 'length', 'rating_ecd', 'special_features_ecd', 'name_ecd', 'n_rentals']


In [62]:
# Place `n_rentals` (the column our target is based on) in the last position for
# readability; all eight columns are kept, only their order changes
X_N = X_N.loc[:, [
    'film_id', 'rental_duration', 'rental_rate', 'length',
    'rating_ecd', 'special_features_ecd', 'name_ecd',
    'n_rentals',
]]
X_N.shape

(1000, 8)

In [63]:
# Re-read the headers from X_N before printing: the previously stored list was
# captured before the columns were reordered, so printing it would not verify the new order
data_headers = list(X_N.columns.values)
print("The Column Headers are :", data_headers)

The Column Headers are : ['film_id', 'rental_duration', 'rental_rate', 'length', 'rating_ecd', 'special_features_ecd', 'name_ecd', 'n_rentals']


<div class="alert alert-block alert-success">

**Now**, we have a cleaned dataset with information on `1,000` films in `8` distinct numerical features.
</div>

In [64]:
# Random five-row preview of the numerical frame X_N
X_N.sample(5)

Unnamed: 0,film_id,rental_duration,rental_rate,length,rating_ecd,special_features_ecd,name_ecd,n_rentals
24,303,6,0.99,58,3,0,0,26.0
97,470,4,4.99,79,4,11,1,7.0
405,585,4,0.99,105,0,7,6,12.0
50,794,5,0.99,52,0,6,0,12.0
554,640,5,4.99,102,4,3,8,11.0
