# Importing all Libraries

In [None]:
#pip install pymysql

In [None]:
import pymysql  # MySQL driver named in the 'mysql+pymysql://' connection URL used by the query cells
from sqlalchemy import create_engine
import pandas as pd
import getpass  # To get the password without showing the input
# Prompt once for the database password; the query cells reuse `password`
# when they build their connection strings.
password = getpass.getpass()

# SQL queries for data

In [None]:
# Connect to the local `sakila` MySQL database as root.
# The connection string is a URL, so the password is percent-encoded first:
# a raw '@', ':' or '/' in the password would otherwise be parsed as a URL
# delimiter and break the connection.
from urllib.parse import quote_plus

connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/sakila'
engine = create_engine(connection_string)

# One row per film title that has at least one rental whose rental_date
# matches the '2005-08' pattern, together with the number of such rentals.
# NOTE(review): film_category and category are joined but no column from them
# is selected or filtered on -- confirm whether those joins are needed.
# NOTE(review): '%%' presumably reaches MySQL as a single '%' wildcard after
# driver-level %-formatting -- confirm for the installed pandas/SQLAlchemy.
query = '''SELECT f.title AS 'film_title', COUNT(r.rental_date)
FROM sakila.film f
JOIN sakila.inventory i USING(film_id)
JOIN sakila.rental r USING(inventory_id)
JOIN sakila.film_category fc USING(film_id)
JOIN sakila.category c USING(category_id)
WHERE r.rental_date LIKE '%%2005-08%%'
GROUP BY f.title
ORDER BY f.title;'''

target = pd.read_sql_query(query, engine)
target.head()

# Adding column 'rented_aug'

In [None]:
# Every title returned by the August query was rented that month, so the
# whole frame gets the positive label.
target['rented_aug'] = 1


In [None]:
target.head()

In [None]:
# Rebuild the engine for the local `sakila` database.
# The password is percent-encoded because the connection string is a URL: a
# raw '@', ':' or '/' in the password would otherwise be read as a delimiter.
from urllib.parse import quote_plus

connection_string = 'mysql+pymysql://root:' + quote_plus(password) + '@localhost/sakila'
engine = create_engine(connection_string)

# Descriptive attributes of every film plus its category name, ordered by
# title. These become the model's input features.
query ='''SELECT f.title AS 'film_title', f.rental_rate, f.rental_duration, f.length, f.rating, f.special_features, c.name AS 'category'
FROM sakila.film f
JOIN sakila.film_category fc USING(film_id)
JOIN sakila.category c USING(category_id)
ORDER BY title;'''

features = pd.read_sql_query(query, engine)
features

# Scaling numerical columns

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
scaler = MinMaxScaler()
#.fit(data= features, columns=['rental_rate','rental_duration','length'])

In [None]:
scaler.fit(features[['rental_rate','rental_duration','length']])

In [None]:
features_trans = scaler.transform(features[['rental_rate','rental_duration','length']])

In [None]:
# NOTE: a bare expression that is not the last line of a cell is evaluated
# but presumably not displayed by the notebook.
features_trans
# Wrap the scaled values in a DataFrame. No column names are given, so the
# columns are labelled 0, 1, 2 (they are renamed in a later cell).
trans_feature = pd.DataFrame(features_trans)

In [None]:
trans_feature

In [None]:
scaled_features = pd.concat([features, trans_feature], axis = 1)
scaled_features

In [None]:
# Dropping 'rental_rate', 'rental_duration', and 'length' now that they are scaled
scale_features = scaled_features.drop(['rental_rate', 'rental_duration','length'], axis = 1)
scale_features

In [None]:
scale_features = scale_features.rename({0:'rental_rate',1:'rental_duration',2:'length'}, axis = 1)

In [None]:
scale_features

In [None]:
# (rows, columns) of the August-rental table ...
target.shape

In [None]:
# ... versus the full feature table. The row-count difference is the number
# of titles that still need a rented_aug = 0 row.
scale_features.shape

# Finding titles NOT rented in August 2005

In [None]:
# Titles of every film in the feature table, and titles that appear in the
# August-rental query result.
all_titles = scale_features['film_title']
all_titles
aug_titles = target['film_title']
aug_titles

In [None]:
lst_all_titles = []
for title in all_titles:
    lst_all_titles.append(title)
len(lst_all_titles)

In [None]:
lst_aug_titles= []
for title in aug_titles:
    lst_aug_titles.append(title)
len(lst_aug_titles)

In [None]:
lst_0_aug = [item for item in lst_all_titles if item not in lst_aug_titles]

len(lst_0_aug)


# Adding the titles NOT rented in August to the target dataframe

In [None]:
# One row per title that was not rented in August. The single unnamed column
# (label 0) holds the title and is renamed to 'Title'.
df_0_aug = pd.DataFrame(lst_0_aug).rename(columns={0: 'Title'})

# Zero rentals and a negative label, using the same column names as `target`.
df_0_aug['COUNT(r.rental_date)'] = 0
df_0_aug['rented_aug'] = 0

df_0_aug.head()

In [None]:
# Use the same 'Title' column name as df_0_aug so the two frames share one
# set of columns when stacked.
target = target.rename(columns = {'film_title': 'Title'})
target.head()

In [None]:
# Rented (rented_aug = 1) and not-rented (rented_aug = 0) titles in a single
# table; ignore_index=True renumbers the rows from 0.
target_all = pd.concat([target, df_0_aug], ignore_index=True)
target_all

# Sorting alphabetically

In [None]:
target_all = target_all.sort_values('Title').reset_index()
target_all

In [None]:
target_all['rented_aug'].value_counts()

In [None]:
target_all = target_all.drop('index', axis = 1)
target_all

# Combining both dataframes

In [None]:
# Attach each film's August label to its feature row by matching on title.
# A positional concat(axis=1) only lines up if both frames have the same rows
# in the same order; any difference between the SQL ORDER BY and the pandas
# sort would silently pair features with the wrong label.
# how='left' keeps exactly the feature rows in their existing order.
# validate='many_to_one' raises if a title occurs more than once in target_all.
full_data = scale_features.merge(
    target_all,
    how='left',
    left_on='film_title',
    right_on='Title',
    validate='many_to_one',
)
full_data

In [None]:
full_data['rented_aug'].value_counts()

In [None]:
full_data = full_data.drop(('Title'),axis=1)


In [None]:
full_data

# Encoding categorical columns 'rating' and 'category'

In [None]:
# One indicator column per distinct rating value, named 'rating_<value>'.
r = pd.get_dummies(full_data['rating'], prefix='rating')
r

In [None]:
# One indicator column per distinct category value, named 'genre_<value>'.
c = pd.get_dummies(full_data['category'], prefix='genre')
c

# Deciding on Special Features

In [None]:
# Number of missing values in 'special_features'.
full_data['special_features'].isna().sum()

#### Will use Special Features if the model needs improvement

## Combining encoded columns to dataframe

In [None]:
encoded = pd.concat([c,r],axis=1)
encoded

In [None]:
full_data_encoded = pd.concat([full_data,encoded],axis=1)
full_data_encoded

# Dropping 'rating' and 'category' columns

In [None]:
full_data_encoded = full_data_encoded.drop(['rating','category'],axis = 1)
full_data_encoded

# Saving 'film_title' and 'special_features' for later use

In [None]:
title_sp_feat = full_data_encoded[['film_title','special_features']]
title_sp_feat

In [None]:
model_data = full_data_encoded.drop(['film_title','special_features'],axis = 1)
model_data

# All columns processed for use in Model

# Train, Test, Split

In [None]:
# Label: 1 if the film was rented in August 2005, else 0.
y = model_data['rented_aug']
# 'COUNT(r.rental_date)' is the August rental count the label was derived
# from: it is >= 1 for every rented_aug == 1 row and 0 for every
# rented_aug == 0 row. Leaving it in X would leak the target into the
# features, so it is removed together with the label.
X = model_data.drop(['rented_aug', 'COUNT(r.rental_date)'], axis = 1)

# train test split
# 25% of the rows are held out; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=75)


In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline model: default-parameter logistic regression on the training split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

# Score on the data it was fitted on and on the held-out split.
train_score = logreg.score(X_train, y_train)
test_score = logreg.score(X_test, y_test)
print('training set score:{:3f}'.format(train_score))
print('test set score:{:3f}'.format(test_score))

In [None]:
# Class predictions of the baseline logistic regression on the held-out test features.
prediction = logreg.predict(X_test)

In [None]:
prediction

# Looking at confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.metrics import confusion_matrix
# Compare the baseline model's test-set predictions with the true labels.
confusion_matrix(y_test, prediction)

# SMOTE for imbalanced data

In [None]:
#pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
def over_sampling(training_x, training_y, random_state=100, k_neighbors=3):
    """Resample a training set with SMOTE and return the resampled data.

    Args:
        training_x: Training features, passed to ``SMOTE.fit_resample``.
        training_y: Training labels, passed to ``SMOTE.fit_resample``.
        random_state: Seed forwarded to ``SMOTE``. Defaults to 100, the value
            that was previously hard-coded.
        k_neighbors: Neighbour count forwarded to ``SMOTE``. Defaults to 3,
            the value that was previously hard-coded.

    Returns:
        A ``(features, labels)`` tuple as produced by ``SMOTE.fit_resample``.
    """
    smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
    resampled_x, resampled_y = smote.fit_resample(training_x, training_y)

    return resampled_x, resampled_y

# Only the training split is resampled; X_test / y_test are not passed in.
X_train_SMOTE, y_train_SMOTE = over_sampling(X_train, y_train)

# Running logistic regression again on SMOTE data

In [None]:
# Refit logistic regression on the SMOTE-resampled training data.
logreg2 = LogisticRegression().fit(X_train_SMOTE, y_train_SMOTE)
# Score the model that was just fitted (logreg2). Using `logreg` here would
# re-report the pre-SMOTE baseline and hide any effect of resampling.
print('training set score:{:3f}'.format(logreg2.score(X_train_SMOTE,y_train_SMOTE)))
print('test set score:{:3f}'.format(logreg2.score(X_test,y_test)))

In [None]:
# Test-set class predictions of the model fitted on the SMOTE-resampled data.
prediction_SMOTE = logreg2.predict(X_test)

In [None]:
# Compare those predictions with the true test labels.
confusion_matrix(y_test, prediction_SMOTE)