## 00. Coding Best Practices

In [30]:
# Basic libraries
import pandas as pd # data manipulatioN
import numpy as np # numerical operations
import matplotlib.pyplot as plt # 2D visualizations
import seaborn as sns # high-resolution visualization
import warnings # warning messages management

# SQL connection
from sqlalchemy import create_engine # let's connect to SQL
from getpass import getpass

# Machine Learning
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.model_selection import train_test_split # splitting data into train/test sets
from sklearn.linear_model import LogisticRegression # logistic model

# Notebook-wide settings
pd.options.display.max_columns = None # no limit on the number of columns displayed
warnings.filterwarnings(action='ignore') # ignore warnings

Please, kindly insert your password:········


In [38]:
# Basic functions
def data_info(df=None): # improved data.info()
    """Print a compact overview of a DataFrame and return a random row sample.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to describe. When omitted, the notebook-level global ``data``
        is used, so existing ``data_info()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        Up to 5 randomly sampled rows (fewer when the frame has fewer rows).
    """
    if df is None:
        df = data  # backward-compatible fallback to the notebook global

    # Compute the dtype tally once; counts and dtype labels are printed on
    # separate lines in matching order.
    dtype_counts = df.dtypes.value_counts()

    print(f"The DataFrame shape is {df.shape}.")
    print()
    print("The DataFrame data types are:")
    print(dtype_counts.tolist())
    print(dtype_counts.index.tolist())
    print()
    print("DataFrame random row sample and full columns:")
    # sample(5) raises ValueError on frames shorter than 5 rows, so cap it.
    return df.sample(min(5, len(df)))

## 01 - Getting the Data

In [36]:
# Creating a safe connection
# The password must be requested first: the connection string below is built
# from it, and on a fresh kernel `password` does not exist yet.
password = getpass("Please, kindly insert your password:")
connection_string = 'mysql+pymysql://root:' + password + '@localhost/sakila'
engine = create_engine(connection_string)

Please, kindly insert your password:········


In [37]:
# Load everything selected from logistic_data into a DataFrame, then preview 5 random rows
data = pd.read_sql_query(sql='SELECT * FROM logistic_data;', con=engine)
data.sample(n=5)

Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals
872,102,4,4.99,60,R,"Trailers,Commentaries,Deleted Scenes,Behind th...",Sports,5.0
954,181,7,2.99,166,PG-13,Commentaries,Travel,26.0
553,623,4,0.99,75,PG,"Trailers,Commentaries,Deleted Scenes,Behind th...",Foreign,17.0
43,664,6,2.99,65,PG,"Trailers,Deleted Scenes",Action,16.0
673,722,5,0.99,123,NC-17,Behind the Scenes,Horror,11.0


In [39]:
# Overview of the loaded `data` frame: shape, dtype counts and a 5-row random sample
data_info()

The DataFrame shape is (1000, 8).

The DataFrame data types are:
[3, 3, 2]
[dtype('int64'), dtype('O'), dtype('float64')]

DataFrame random row sample and full columns:


Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals
891,383,6,0.99,111,NC-17,Behind the Scenes,Sports,18.0
14,205,3,0.99,58,NC-17,"Trailers,Commentaries,Deleted Scenes,Behind th...",Action,12.0
906,598,6,0.99,57,G,Trailers,Sports,14.0
689,922,7,4.99,107,PG-13,Commentaries,Horror,21.0
848,602,5,0.99,146,PG,"Trailers,Commentaries,Deleted Scenes,Behind th...",Sci-Fi,21.0


<div class="alert alert-block alert-info">
    
**First impression:**
    
_____________

The following dataset is a collection of **one year** of film rental information (starting 2005/02/01; TODO: the end date is truncated in this note and must be restored). The majority of our data types are **numerical** (3 integer and 2 float columns versus 3 object columns).
    
The dataset contains information on 1,000 films, collected over **one year**, distributed among 8 different columns.
    
Our **binary target** will be whether a movie is likely to be rented again in the future (low or high demand). We will therefore classify movies into low/high (binary) rental demand by analyzing the total number of rentals for each film within the time period (`n_rentals`).

Our **project goal** is to identify the films that are likely to be in **high rental demand**. We decide to proceed with the following **strategy**:

1. The **target** of our dataset will be a binary low/high demand label derived from `n_rentals` (the total number of rentals per film).
2. The majority of the data types are **numericals**, so we will work with that.
3. Through **Exploratory Data Analysis** we will identify the features that contribute to this prediction.
_____________
</div>

## 02 - Cleaning the Data

In [21]:
# Best practice: keep an independent (deep) copy of the loaded frame to work on
data_copy = data.copy(deep=True)

In [22]:
# One-hot encode the two categorical columns and preview 5 random rows.
# NOTE(review): the result is assigned back to `data_copy`, so re-running this
# cell without re-creating the copy will not find the original columns.
data_copy = pd.get_dummies(data=data_copy, columns=['rating', 'special_features'])
data_copy.sample(n=5)

Unnamed: 0,film_id,rental_duration,rental_rate,length,name,n_rentals,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R,special_features_Behind the Scenes,special_features_Commentaries,"special_features_Commentaries,Behind the Scenes","special_features_Commentaries,Deleted Scenes","special_features_Commentaries,Deleted Scenes,Behind the Scenes",special_features_Deleted Scenes,"special_features_Deleted Scenes,Behind the Scenes",special_features_Trailers,"special_features_Trailers,Behind the Scenes","special_features_Trailers,Commentaries","special_features_Trailers,Commentaries,Behind the Scenes","special_features_Trailers,Commentaries,Deleted Scenes","special_features_Trailers,Commentaries,Deleted Scenes,Behind the Scenes","special_features_Trailers,Deleted Scenes","special_features_Trailers,Deleted Scenes,Behind the Scenes"
477,610,7,0.99,129,Family,19.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
642,13,4,4.99,150,Horror,8.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
131,59,4,2.99,160,Children,22.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
734,797,7,0.99,67,Music,16.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
220,525,4,0.99,140,Classics,26.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [28]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the two categorical columns, fitting a fresh encoder for each.
# NOTE(review): 'rating' is kept and its codes are stored in 'rating_encoded',
# whereas 'special_features' is overwritten in place, and both writes go to
# `data`, not `data_copy` - confirm that this asymmetry is intended.
data['rating_encoded'] = LabelEncoder().fit_transform(data['rating'])

# `le` ends up bound to the encoder fitted on 'special_features'.
le = LabelEncoder()
data['special_features'] = le.fit_transform(data['special_features'])

data.sample(n=5)

Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals,rating_encoded
990,872,3,2.99,185,R,5,Travel,15.0,4
776,490,4,2.99,121,G,2,New,8.0,0
555,641,4,0.99,76,PG-13,9,Foreign,24.0,3
427,907,4,0.99,168,PG-13,7,Drama,15.0,3
877,187,5,2.99,57,NC-17,1,Sports,11.0,1
