# Importing all Libraries

In [None]:
#pip install pymysql

In [1]:
import pymysql
from sqlalchemy import create_engine
import pandas as pd
import getpass  # To get the password without showing the input
password = getpass.getpass()

········


# SQL queries for data

In [2]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/sakila'
engine = create_engine(connection_string)
query = '''SELECT f.title AS 'film_title', COUNT(r.rental_date)
FROM sakila.film f
JOIN sakila.inventory i USING(film_id)
JOIN sakila.rental r USING(inventory_id)
JOIN sakila.film_category fc USING(film_id)
JOIN sakila.category c USING(category_id)
WHERE r.rental_date LIKE '%%2005-08%%'
GROUP BY f.title
ORDER BY f.title;'''

target = pd.read_sql_query(query, engine)
target.head()

Unnamed: 0,film_title,COUNT(r.rental_date)
0,ACADEMY DINOSAUR,9
1,ACE GOLDFINGER,4
2,ADAPTATION HOLES,6
3,AFFAIR PREJUDICE,6
4,AFRICAN EGG,3


# Adding column 'rented_aug'

In [3]:
target['rented_aug'] = 1


In [4]:
target.head()

Unnamed: 0,film_title,COUNT(r.rental_date),rented_aug
0,ACADEMY DINOSAUR,9,1
1,ACE GOLDFINGER,4,1
2,ADAPTATION HOLES,6,1
3,AFFAIR PREJUDICE,6,1
4,AFRICAN EGG,3,1


In [5]:
connection_string = 'mysql+pymysql://root:'+password+'@localhost/sakila'
engine = create_engine(connection_string)
query ='''SELECT f.title AS 'film_title', f.rental_rate, f.rental_duration, f.length, f.rating, f.special_features, c.name AS 'category'
FROM sakila.film f
JOIN sakila.film_category fc USING(film_id)
JOIN sakila.category c USING(category_id)
ORDER BY title;'''

features = pd.read_sql_query(query, engine)
features

Unnamed: 0,film_title,rental_rate,rental_duration,length,rating,special_features,category
0,ACADEMY DINOSAUR,0.99,6,86,PG,"Deleted Scenes,Behind the Scenes",Documentary
1,ACE GOLDFINGER,4.99,3,48,G,"Trailers,Deleted Scenes",Horror
2,ADAPTATION HOLES,2.99,7,50,NC-17,"Trailers,Deleted Scenes",Documentary
3,AFFAIR PREJUDICE,2.99,5,117,G,"Commentaries,Behind the Scenes",Horror
4,AFRICAN EGG,2.99,6,130,G,Deleted Scenes,Family
...,...,...,...,...,...,...,...
995,YOUNG LANGUAGE,0.99,6,183,G,"Trailers,Behind the Scenes",Documentary
996,YOUTH KICK,0.99,4,179,NC-17,"Trailers,Behind the Scenes",Music
997,ZHIVAGO CORE,0.99,6,105,NC-17,Deleted Scenes,Horror
998,ZOOLANDER FICTION,2.99,5,101,R,"Trailers,Deleted Scenes",Children


# Encoding catagorical columns 'rating' and 'category'

In [6]:
r = pd.get_dummies(features['rating'], prefix='rating')
r

Unnamed: 0,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R
0,0,0,1,0,0
1,1,0,0,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0
...,...,...,...,...,...
995,1,0,0,0,0
996,0,1,0,0,0
997,0,1,0,0,0
998,0,0,0,0,1


In [7]:
c = pd.get_dummies(features['category'], prefix='genre')
c

Unnamed: 0,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,genre_Documentary,genre_Drama,genre_Family,genre_Foreign,genre_Games,genre_Horror,genre_Music,genre_New,genre_Sci-Fi,genre_Sports,genre_Travel
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
998,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
encoded = pd.concat([c,r],axis=1)
encoded

Unnamed: 0,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,genre_Documentary,genre_Drama,genre_Family,genre_Foreign,genre_Games,...,genre_Music,genre_New,genre_Sci-Fi,genre_Sports,genre_Travel,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
998,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [9]:
full_data = pd.concat([features,encoded],axis=1)
full_data

Unnamed: 0,film_title,rental_rate,rental_duration,length,rating,special_features,category,genre_Action,genre_Animation,genre_Children,...,genre_Music,genre_New,genre_Sci-Fi,genre_Sports,genre_Travel,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R
0,ACADEMY DINOSAUR,0.99,6,86,PG,"Deleted Scenes,Behind the Scenes",Documentary,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,ACE GOLDFINGER,4.99,3,48,G,"Trailers,Deleted Scenes",Horror,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,ADAPTATION HOLES,2.99,7,50,NC-17,"Trailers,Deleted Scenes",Documentary,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,AFFAIR PREJUDICE,2.99,5,117,G,"Commentaries,Behind the Scenes",Horror,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,AFRICAN EGG,2.99,6,130,G,Deleted Scenes,Family,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,YOUNG LANGUAGE,0.99,6,183,G,"Trailers,Behind the Scenes",Documentary,0,0,0,...,0,0,0,0,0,1,0,0,0,0
996,YOUTH KICK,0.99,4,179,NC-17,"Trailers,Behind the Scenes",Music,0,0,0,...,1,0,0,0,0,0,1,0,0,0
997,ZHIVAGO CORE,0.99,6,105,NC-17,Deleted Scenes,Horror,0,0,0,...,0,0,0,0,0,0,1,0,0,0
998,ZOOLANDER FICTION,2.99,5,101,R,"Trailers,Deleted Scenes",Children,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [10]:
full_data = full_data.drop(['rating', 'category'], axis = 1)
full_data

Unnamed: 0,film_title,rental_rate,rental_duration,length,special_features,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,...,genre_Music,genre_New,genre_Sci-Fi,genre_Sports,genre_Travel,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R
0,ACADEMY DINOSAUR,0.99,6,86,"Deleted Scenes,Behind the Scenes",0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,ACE GOLDFINGER,4.99,3,48,"Trailers,Deleted Scenes",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,ADAPTATION HOLES,2.99,7,50,"Trailers,Deleted Scenes",0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,AFFAIR PREJUDICE,2.99,5,117,"Commentaries,Behind the Scenes",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,AFRICAN EGG,2.99,6,130,Deleted Scenes,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,YOUNG LANGUAGE,0.99,6,183,"Trailers,Behind the Scenes",0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
996,YOUTH KICK,0.99,4,179,"Trailers,Behind the Scenes",0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
997,ZHIVAGO CORE,0.99,6,105,Deleted Scenes,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
998,ZOOLANDER FICTION,2.99,5,101,"Trailers,Deleted Scenes",0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


# Deciding on Special Features

In [11]:
full_data['special_features'].isna().sum()

0

#### Will use Special Features if model neeeds improved

# Saving 'film_title' and 'special_features' for later use

In [12]:
title_sp_feat = full_data[['film_title','special_features']]
title_sp_feat

Unnamed: 0,film_title,special_features
0,ACADEMY DINOSAUR,"Deleted Scenes,Behind the Scenes"
1,ACE GOLDFINGER,"Trailers,Deleted Scenes"
2,ADAPTATION HOLES,"Trailers,Deleted Scenes"
3,AFFAIR PREJUDICE,"Commentaries,Behind the Scenes"
4,AFRICAN EGG,Deleted Scenes
...,...,...
995,YOUNG LANGUAGE,"Trailers,Behind the Scenes"
996,YOUTH KICK,"Trailers,Behind the Scenes"
997,ZHIVAGO CORE,Deleted Scenes
998,ZOOLANDER FICTION,"Trailers,Deleted Scenes"


# dropping 'film_title' and 'special_features'

In [13]:
full_data = full_data.drop(['film_title','special_features'], axis = 1)

In [14]:
full_data

Unnamed: 0,rental_rate,rental_duration,length,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,genre_Documentary,genre_Drama,...,genre_Music,genre_New,genre_Sci-Fi,genre_Sports,genre_Travel,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R
0,0.99,6,86,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,4.99,3,48,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,2.99,7,50,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,2.99,5,117,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2.99,6,130,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.99,6,183,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
996,0.99,4,179,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
997,0.99,6,105,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
998,2.99,5,101,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# Finding titles NOT rented in August 2005

In [15]:
all_titles = features['film_title']
all_titles
aug_titles = target['film_title']
aug_titles


0       ACADEMY DINOSAUR
1         ACE GOLDFINGER
2       ADAPTATION HOLES
3       AFFAIR PREJUDICE
4            AFRICAN EGG
             ...        
953       YOUNG LANGUAGE
954           YOUTH KICK
955         ZHIVAGO CORE
956    ZOOLANDER FICTION
957            ZORRO ARK
Name: film_title, Length: 958, dtype: object

In [16]:
lst_all_titles = []
for title in all_titles:
    lst_all_titles.append(title)
len(lst_all_titles)

1000

In [17]:
lst_aug_titles= []
for title in aug_titles:
    lst_aug_titles.append(title)
len(lst_aug_titles)

958

In [18]:
lst_0_aug = [item for item in lst_all_titles if item not in lst_aug_titles]

len(lst_0_aug)


42

# Adding the NOT rented in August to the target dataframe

In [19]:
df_0_aug = pd.DataFrame(lst_0_aug)

df_0_aug['COUNT(r.rental_date)'] = 0
df_0_aug['rented_aug'] = 0

df_0_aug = df_0_aug.rename(columns = {0:'Title'})
#df_new = df.rename(columns={'A': 'Col_1'}
df_0_aug.head()

Unnamed: 0,Title,COUNT(r.rental_date),rented_aug
0,ALICE FANTASIA,0,0
1,APOLLO TEEN,0,0
2,ARGONAUTS TOWN,0,0
3,ARK RIDGEMONT,0,0
4,ARSENIC INDEPENDENCE,0,0


In [20]:
target = target.rename(columns = {'film_title': 'Title'})
target.head()

Unnamed: 0,Title,COUNT(r.rental_date),rented_aug
0,ACADEMY DINOSAUR,9,1
1,ACE GOLDFINGER,4,1
2,ADAPTATION HOLES,6,1
3,AFFAIR PREJUDICE,6,1
4,AFRICAN EGG,3,1


In [21]:
target_all = pd.concat([target, df_0_aug], ignore_index=True)
target_all

Unnamed: 0,Title,COUNT(r.rental_date),rented_aug
0,ACADEMY DINOSAUR,9,1
1,ACE GOLDFINGER,4,1
2,ADAPTATION HOLES,6,1
3,AFFAIR PREJUDICE,6,1
4,AFRICAN EGG,3,1
...,...,...,...
995,TREASURE COMMAND,0,0
996,VILLAIN DESPERATE,0,0
997,VOLUME HOUSE,0,0
998,WAKE JAWS,0,0


# Sorting alphabetically

In [22]:
target_all = target_all.sort_values('Title').reset_index()
target_all

Unnamed: 0,index,Title,COUNT(r.rental_date),rented_aug
0,0,ACADEMY DINOSAUR,9,1
1,1,ACE GOLDFINGER,4,1
2,2,ADAPTATION HOLES,6,1
3,3,AFFAIR PREJUDICE,6,1
4,4,AFRICAN EGG,3,1
...,...,...,...,...
995,953,YOUNG LANGUAGE,3,1
996,954,YOUTH KICK,3,1
997,955,ZHIVAGO CORE,2,1
998,956,ZOOLANDER FICTION,7,1


In [23]:
target_all['rented_aug'].value_counts()

1    958
0     42
Name: rented_aug, dtype: int64

In [24]:
target_all

Unnamed: 0,index,Title,COUNT(r.rental_date),rented_aug
0,0,ACADEMY DINOSAUR,9,1
1,1,ACE GOLDFINGER,4,1
2,2,ADAPTATION HOLES,6,1
3,3,AFFAIR PREJUDICE,6,1
4,4,AFRICAN EGG,3,1
...,...,...,...,...
995,953,YOUNG LANGUAGE,3,1
996,954,YOUTH KICK,3,1
997,955,ZHIVAGO CORE,2,1
998,956,ZOOLANDER FICTION,7,1


In [25]:
target_column = target_all.drop(['Title','index'], axis = 1)
target_column

Unnamed: 0,COUNT(r.rental_date),rented_aug
0,9,1
1,4,1
2,6,1
3,6,1
4,3,1
...,...,...
995,3,1
996,3,1
997,2,1
998,7,1


# Combining both dataframes

In [26]:
model_data1 = pd.concat([full_data,target_column],axis=1)
model_data1

Unnamed: 0,rental_rate,rental_duration,length,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,genre_Documentary,genre_Drama,...,genre_Sci-Fi,genre_Sports,genre_Travel,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R,COUNT(r.rental_date),rented_aug
0,0.99,6,86,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,9,1
1,4.99,3,48,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,4,1
2,2.99,7,50,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,6,1
3,2.99,5,117,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,6,1
4,2.99,6,130,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.99,6,183,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,3,1
996,0.99,4,179,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,3,1
997,0.99,6,105,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,2,1
998,2.99,5,101,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,7,1


# Trying Logistic Regression first time

In [28]:
y = model_data1['rented_aug']
X = model_data1.drop(['rented_aug','COUNT(r.rental_date)'], axis = 1)
# dropping 'COUNT(r.rental_date)' because it is directly correlated to 'rented_aug'

# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=75)
print('X_train shape is:',X_train.shape)
print('y_train shape is:', y_train.shape)
print('X_test shape is:', X_test.shape)
print('y_train shape is:', y_test.shape)

X_train shape is: (750, 24)
y_train shape is: (750,)
X_test shape is: (250, 24)
y_train shape is: (250,)


In [29]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(X_train, y_train)
print('training set score:{:3f}'.format(logreg.score(X_train,y_train)))
print('test set score:{:3f}'.format(logreg.score(X_test,y_test)))

training set score:0.961333
test set score:0.948000


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## This pink warning message tells me I have to scale the numerical data

# Looking at confusion matrix

In [30]:
from sklearn.metrics import confusion_matrix

In [31]:
prediction = logreg.predict(X_test)

In [32]:
prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [33]:
confusion_matrix(y_test, prediction)

array([[  0,  13],
       [  0, 237]])

# Scaling numerical columns

In [34]:
from sklearn.preprocessing import MinMaxScaler

In [35]:
scaler = MinMaxScaler()
#.fit(data= features, columns=['rental_rate','rental_duration','length'])

In [36]:
scaler.fit(features[['rental_rate','rental_duration','length']])

MinMaxScaler()

In [37]:
features_trans = scaler.transform(features[['rental_rate','rental_duration','length']])

In [38]:
features_trans
trans_feature = pd.DataFrame(features_trans)

In [39]:
trans_feature

Unnamed: 0,0,1,2
0,0.0,0.75,0.287770
1,1.0,0.00,0.014388
2,0.5,1.00,0.028777
3,0.5,0.50,0.510791
4,0.5,0.75,0.604317
...,...,...,...
995,0.0,0.75,0.985612
996,0.0,0.25,0.956835
997,0.0,0.75,0.424460
998,0.5,0.50,0.395683


In [40]:
scaled_features = pd.concat([model_data1, trans_feature], axis = 1)
scaled_features

Unnamed: 0,rental_rate,rental_duration,length,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,genre_Documentary,genre_Drama,...,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R,COUNT(r.rental_date),rented_aug,0,1,2
0,0.99,6,86,0,0,0,0,0,1,0,...,0,0,1,0,0,9,1,0.0,0.75,0.287770
1,4.99,3,48,0,0,0,0,0,0,0,...,1,0,0,0,0,4,1,1.0,0.00,0.014388
2,2.99,7,50,0,0,0,0,0,1,0,...,0,1,0,0,0,6,1,0.5,1.00,0.028777
3,2.99,5,117,0,0,0,0,0,0,0,...,1,0,0,0,0,6,1,0.5,0.50,0.510791
4,2.99,6,130,0,0,0,0,0,0,0,...,1,0,0,0,0,3,1,0.5,0.75,0.604317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.99,6,183,0,0,0,0,0,1,0,...,1,0,0,0,0,3,1,0.0,0.75,0.985612
996,0.99,4,179,0,0,0,0,0,0,0,...,0,1,0,0,0,3,1,0.0,0.25,0.956835
997,0.99,6,105,0,0,0,0,0,0,0,...,0,1,0,0,0,2,1,0.0,0.75,0.424460
998,2.99,5,101,0,0,1,0,0,0,0,...,0,0,0,0,1,7,1,0.5,0.50,0.395683


In [41]:
scaled_features = scaled_features.drop(['rental_rate','rental_duration','length'], axis = 1)

In [42]:
scaled_features = scaled_features.rename({0:'rental_rate',1:'rental_duration',2:'length'}, axis = 1)

In [43]:
scaled_features

Unnamed: 0,genre_Action,genre_Animation,genre_Children,genre_Classics,genre_Comedy,genre_Documentary,genre_Drama,genre_Family,genre_Foreign,genre_Games,...,rating_G,rating_NC-17,rating_PG,rating_PG-13,rating_R,COUNT(r.rental_date),rented_aug,rental_rate,rental_duration,length
0,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,9,1,0.0,0.75,0.287770
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,4,1,1.0,0.00,0.014388
2,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,6,1,0.5,1.00,0.028777
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,6,1,0.5,0.50,0.510791
4,0,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,3,1,0.5,0.75,0.604317
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,1,0,0,0,0,...,1,0,0,0,0,3,1,0.0,0.75,0.985612
996,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,3,1,0.0,0.25,0.956835
997,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,2,1,0.0,0.75,0.424460
998,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,7,1,0.5,0.50,0.395683


# All columns processed and scaled for use in Model

# Train, Test, Split

In [44]:
y = scaled_features['rented_aug']
X =scaled_features.drop(['rented_aug','COUNT(r.rental_date)'], axis = 1)

# train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=75)


In [45]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression().fit(X_train, y_train)
print('training set score:{:3f}'.format(logreg.score(X_train,y_train)))
print('test set score:{:3f}'.format(logreg.score(X_test,y_test)))

training set score:0.961333
test set score:0.948000


In [46]:
prediction = logreg.predict(X_test)

In [47]:
prediction

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

# Looking at confusion matrix

In [48]:
confusion_matrix(y_test, prediction)

array([[  0,  13],
       [  0, 237]])

#### This model seems too good to be true.  There is clearly an imbalance of data.  

# SMOTE for imbalanced data

In [49]:
#pip install imbalanced-learn

In [50]:
from imblearn.over_sampling import SMOTE

In [51]:
def over_sampling(training_x, training_y):

    smote = SMOTE(random_state = 100, k_neighbors = 3)
    X_train_scaled_SMOTE, y_train_SMOTE = smote.fit_resample(training_x, training_y)

    return X_train_scaled_SMOTE, y_train_SMOTE    

X_train_SMOTE, y_train_SMOTE = over_sampling(X_train, y_train)
X_test_SMOTE, y_test_SMOTE = over_sampling(X_test,y_test)

# Running logistic regression again on SMOTE data

In [52]:
logreg2 = LogisticRegression().fit(X_train_SMOTE, y_train_SMOTE)
print('training set score:{:3f}'.format(logreg.score(X_train_SMOTE,y_train_SMOTE)))
print('test set score:{:3f}'.format(logreg.score(X_test_SMOTE,y_test_SMOTE)))

training set score:0.500000
test set score:0.500000


In [53]:
prediction_SMOTE = logreg2.predict(X_test)

In [54]:
confusion_matrix(y_test, prediction_SMOTE)

array([[  0,  13],
       [  8, 229]])