## 00 - Coding Best Practices

In [30]:
# Standard library
import warnings # warning messages management
from getpass import getpass
from urllib.parse import quote # percent-encoding of the password for the connection URL

# Basic libraries
import pandas as pd # data manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # 2D visualizations
import seaborn as sns # high-level statistical visualization

# SQL connection
from sqlalchemy import create_engine # let's connect to SQL
# Ask for the database password interactively so it is never written into the notebook
password = getpass(prompt="Please, kindly insert your password:")

# Machine Learning
from sklearn.preprocessing import StandardScaler # feature scaling
from sklearn.model_selection import train_test_split # splitting data into train/test sets
from sklearn.linear_model import LogisticRegression # logistic model

# Settings
pd.options.display.max_columns = None # no cap on the number of columns shown for a frame
warnings.simplefilter('ignore') # silence every warning for the rest of the session

Please, kindly insert your password:········


## 01 - Getting the Data

In [32]:
# Creating a safe connection
# The password is percent-encoded before being embedded in the URL: characters such
# as '@', ':', '/' or '%' would otherwise be read as URL delimiters/escapes and the
# connection string would be parsed with the wrong credentials, host or database.
# `connection_string` still contains the (encoded) password, so avoid displaying it.
connection_string = 'mysql+pymysql://root:' + quote(password, safe='') + '@localhost/sakila'
engine = create_engine(connection_string)

In [33]:
# Getting the data
# Load the whole `logistic_data` table through the engine into a DataFrame.
data = pd.read_sql_query('SELECT * FROM logistic_data;', engine)
# Preview five random rows; the fixed seed keeps the displayed rows identical across re-runs.
data.sample(5, random_state=42)

Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals
636,956,7,4.99,107,PG-13,"Commentaries,Deleted Scenes,Behind the Scenes",Games,15.0
129,986,6,2.99,85,NC-17,"Trailers,Commentaries",Animation,18.0
597,362,7,2.99,115,PG-13,"Trailers,Commentaries,Behind the Scenes",Games,5.0
122,886,5,0.99,184,PG-13,"Deleted Scenes,Behind the Scenes",Animation,10.0
434,979,6,4.99,100,NC-17,"Commentaries,Behind the Scenes",Drama,30.0


<div class="alert alert-block alert-info">
    
**First impression:**
    
_____________

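**TODO:** this summary describes the FIFA 21 player dataset, not the `logistic_data` film table loaded above; rewrite it for the current data.

The following dataset is a **one-year** collection of information on 17,125 players, distributed across 107 columns. Most of the columns are **numerical** (45 integers and 10 floats, against 52 object columns).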

Our **project goal** is to identify players who have the potential to become **the next Mbappé**. After reading the [documentation](https://www.kaggle.com/datasets/ekrembayar/fifa-21-complete-player-dataset?select=fifa21_male2.csv), we decided to proceed with the following **strategy**:

1. The **target** of our dataset will be `OVA` (overall score), which is a summary of a player's performance and potential. 
2. Most of the columns are **numerical**, so we will focus on those.
3. Through **Exploratory Data Analysis** we will identify the features that contribute to this prediction.
_____________
</div>

## 02 - Cleaning the Data

In [21]:
# Independent deep copy: later changes to `data_copy` do not propagate back to `data`
data_copy = data.copy(deep=True)

In [22]:
# One-hot encode the categorical columns of `data_copy`.
# Only the columns that are still present get encoded, so re-running this cell is a
# no-op instead of raising a KeyError on the already-replaced source columns.
# NOTE(review): `special_features` holds comma-separated lists, so every distinct
# combination becomes its own dummy column; consider
# `data_copy['special_features'].str.get_dummies(sep=',')` if one column per
# individual feature is what the model should see.
categorical_columns = ['rating', 'special_features']
pending_columns = [column for column in categorical_columns if column in data_copy.columns]
if pending_columns:
    data_copy = pd.get_dummies(data_copy, columns=pending_columns)
# Preview five random rows; the fixed seed keeps the displayed rows identical across re-runs.
data_copy.sample(5, random_state=42)

Unnamed: 0,film_id,rental_duration,rental_rate,length,name,n_rentals,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R,special_features_Behind the Scenes,special_features_Commentaries,"special_features_Commentaries,Behind the Scenes","special_features_Commentaries,Deleted Scenes","special_features_Commentaries,Deleted Scenes,Behind the Scenes",special_features_Deleted Scenes,"special_features_Deleted Scenes,Behind the Scenes",special_features_Trailers,"special_features_Trailers,Behind the Scenes","special_features_Trailers,Commentaries","special_features_Trailers,Commentaries,Behind the Scenes","special_features_Trailers,Commentaries,Deleted Scenes","special_features_Trailers,Commentaries,Deleted Scenes,Behind the Scenes","special_features_Trailers,Deleted Scenes","special_features_Trailers,Deleted Scenes,Behind the Scenes"
477,610,7,0.99,129,Family,19.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
642,13,4,4.99,150,Horror,8.0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
131,59,4,2.99,160,Children,22.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
734,797,7,0.99,67,Music,16.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
220,525,4,0.99,140,Classics,26.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [28]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the two categorical columns of `data`, keeping one fitted encoder per
# column so that either set of codes can be mapped back with `inverse_transform`.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target values (y);
# the integer codes follow sorted label order, which carries no meaning for nominal
# features — confirm this is intended before feeding them to a linear model.
rating_encoder = LabelEncoder()
data['rating_encoded'] = rating_encoder.fit_transform(data['rating'])

# Unlike `rating`, `special_features` is overwritten in place: after the first run the
# original strings are only recoverable through `special_features_encoder.classes_`.
special_features_encoder = LabelEncoder()
data['special_features'] = special_features_encoder.fit_transform(data['special_features'])

# Backward-compatible alias: `le` previously ended up bound to the special-features encoder.
le = special_features_encoder

# Preview five random rows; the fixed seed keeps the displayed rows identical across re-runs.
data.sample(5, random_state=42)

Unnamed: 0,film_id,rental_duration,rental_rate,length,rating,special_features,name,n_rentals,rating_encoded
990,872,3,2.99,185,R,5,Travel,15.0,4
776,490,4,2.99,121,G,2,New,8.0,0
555,641,4,0.99,76,PG-13,9,Foreign,24.0,3
427,907,4,0.99,168,PG-13,7,Drama,15.0,3
877,187,5,2.99,57,NC-17,1,Sports,11.0,1
