https://machinelearningmastery.com/neural-network-models-for-combined-classification-and-regression/

In [7]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score

# Run this cell only once: it reads the raw IMDb CSV files into memory.

# Silence pandas' SettingWithCopyWarning for the rest of the session.
pd.set_option('mode.chained_assignment', None)

# Movie metadata; `year` is kept as text rather than parsed as a number.
dfMovies = pd.read_csv(
    'data/IMDb movies.csv',
    dtype={"year": str},
)

# People, per-title ratings, and the title <-> person link table.
dfNames = pd.read_csv('data/IMDb names.csv')
dfRatings = pd.read_csv('data/IMDb ratings.csv')
dfTitlePrincipals = pd.read_csv('data/IMDb title_principals.csv')


In [17]:
from tensorflow.keras.utils import to_categorical
import keras
# dfMovies = dfMovies[['imdb_title_id', 'title', 'year','genre', 'duration', 'country', 'language', 'director', 'writer', 'production_company', 'budget', 'worlwide_gross_income']]
# dfNames = dfNames[['imdb_name_id', 'name']]
# dfRatings = dfRatings[['imdb_title_id', 'weighted_average_vote', 'total_votes', 'mean_vote' ]] # come back to this and include age groups/male/female votes
# dfTitlePrincipals = dfTitlePrincipals[['imdb_title_id', 'ordering', 'imdb_name_id']] # shouyld we get category? "actress"/"actor" (dont know if its necessary)


# Build the feature matrix X and the regression target y from the raw frames.
#
# TODO: one-hot encode into the top 10 features for:
#       country, language, director, writer, production_company, actor names

# Keep the join key plus the columns that feed the model.
dfMovies = dfMovies[['imdb_title_id', 'year', 'duration', 'worlwide_gross_income', 'genre', 'language', 'country']]

# One indicator column per distinct value; genre/country/language cells are
# split on ', ' so a cell listing several values sets several indicators.
oneHotGenres = dfMovies['genre'].str.get_dummies(', ')
oneHotCountries = dfMovies['country'].str.get_dummies(', ')
oneHotLanguages = dfMovies['language'].str.get_dummies(', ')
# `year` is loaded as a string, so every distinct year string gets its own column.
oneHotYears = dfMovies['year'].str.get_dummies()
data = pd.concat([dfMovies, oneHotGenres, oneHotLanguages, oneHotCountries, oneHotYears], axis=1)

# TODO: come back to this and include age groups / male / female votes.
dfRatings = dfRatings[['imdb_title_id', 'weighted_average_vote']]

# Inner join on imdb_title_id (the pd.merge default); the key is not a feature.
data = pd.merge(data, dfRatings, on=["imdb_title_id"])
data = data.drop('imdb_title_id', axis=1)

# Drop every row that has any missing value (done before the income clean-up).
data = data.dropna()

# Strip everything up to and including the last space in the income string,
# leaving only the trailing token so the column can be cast to float.
# The result is assigned back instead of using inplace=True on a column
# selection, which does not update `data` when pandas Copy-on-Write is active.
# NOTE(review): any currency prefix is discarded without conversion -- confirm
# all amounts are in the same currency.
data['worlwide_gross_income'] = data['worlwide_gross_income'].replace(
    to_replace=r'^.*\ ', value='', regex=True
)

# Select the target by name rather than by position, and build X without
# mutating a slice of `data`.
target_col = 'weighted_average_vote'
y = data[target_col].astype(float)
X = data.drop(columns=[target_col, 'genre', 'language', 'country', 'year']).astype(float)

# The one-hot frames can produce the same column label more than once;
# keep only the first occurrence of each label.
X = X.loc[:, ~X.columns.duplicated()]
print(data.describe())

# Persist the feature matrix for inspection.
X.to_csv('test.csv', index=False)



           duration        Action    Adult     Adventure     Animation  \
count  30856.000000  30856.000000  30856.0  30856.000000  30856.000000   
mean     105.279654      0.159645      0.0      0.098393      0.042844   
std       20.429065      0.366282      0.0      0.297849      0.202509   
min       41.000000      0.000000      0.0      0.000000      0.000000   
25%       92.000000      0.000000      0.0      0.000000      0.000000   
50%      101.000000      0.000000      0.0      0.000000      0.000000   
75%      114.000000      0.000000      0.0      0.000000      0.000000   
max      808.000000      1.000000      0.0      1.000000      1.000000   

          Biography       Comedy         Crime   Documentary         Drama  \
count  30856.000000  30856.00000  30856.000000  30856.000000  30856.000000   
mean       0.043330      0.38926      0.133912      0.000032      0.587503   
std        0.203603      0.48759      0.340564      0.005693      0.492292   
min        0.000000  

In [42]:
# Train a small dense regression network on X -> y and report the hold-out MAE.

# Seed every source of randomness (split, weight initialisation, batch
# shuffling) so the reported MAE is repeatable on Restart & Run All.
# Local import: only tensorflow.keras submodules are imported at the top.
import tensorflow as tf

SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only
# 30% -- confirm this is intended and not a swapped 70/30 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=SEED)

# NOTE(review): features are fed to the network unscaled (e.g. duration next
# to worlwide_gross_income); consider standardising them before training.

# define the keras model: three hidden layers and a single linear output
model = Sequential()
model.add(Dense(784, input_dim=X.shape[1], activation='relu', kernel_initializer='he_normal'))
model.add(Dense(128, activation='sigmoid', kernel_initializer='he_normal'))
model.add(Dense(10, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='linear'))

# Optimise mean squared error; also track MAE during training.
model.compile(loss='mse', optimizer='adam', metrics=['mse', 'mae'])

model.fit(X_train, y_train, epochs=10, batch_size=784)

# Evaluate on the held-out rows.
yhat = model.predict(X_test)
error = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % error)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
MAE: 0.843
[6.0871296]
6.3
[[6.087129]]
       duration  worlwide_gross_income  Action  Adult  Adventure  Animation  \
76327      94.0               202788.0     0.0    0.0        0.0        0.0   

       Biography  Comedy  Crime  Documentary  ...  2012  2013  2014  2015  \
76327        0.0     1.0    0.0          0.0  ...   0.0   0.0   0.0   0.0   

       2016  2017  2018  2019  2020  TV Movie 2019  
76327   0.0   1.0   0.0   0.0   0.0            0.0  

[1 rows x 601 columns]
