In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import preprocessing
import pandas as pd
import numpy as np

# Obtain Data

In [3]:
# Download the raw movie data. The file has no header row, so header=None
# tells pandas to label the columns 0..5 instead of consuming the first record
# as column names. (The former header=-1 spelling is rejected by current pandas
# with "Passing negative integer to header is invalid".)
table = pd.read_csv("http://www.simbrain.net/misc/movies_uncleaned.csv", header=None)

In [4]:
# Preview the first five raw rows (columns are still labelled by position).
table.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,77000000.0,610000.0,9.2,60.0,6700.0,PG-13
1,63000000.0,720000.0,,65.0,7200.0,R
2,69000000.0,610000.0,9.466667,67.0,6900.0,PG
3,asdf,,,,,
4,64000000.0,720000.0,8.666667,65.0,7300.0,PG-13


In [5]:
# Replace the positional column labels with descriptive names, in file order.
table.columns = [
    "Budget",
    "Profit",
    "Average review",
    "Average age of viewer",
    "Number of Views",
    "Film-rating",
]

In [6]:
# Preview the first five rows to confirm the new column names.
table.head(n=5)

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating
0,77000000.0,610000.0,9.2,60.0,6700.0,PG-13
1,63000000.0,720000.0,,65.0,7200.0,R
2,69000000.0,610000.0,9.466667,67.0,6900.0,PG
3,asdf,,,,,
4,64000000.0,720000.0,8.666667,65.0,7300.0,PG-13


# Cleaning Data

In [7]:
# Remove the junk record (the row whose only entry is the text "asdf").
# The row is selected by content rather than by position (table.index[3]) so
# the cell is idempotent: re-running it no longer drops another, valid row.
numeric_columns = ["Budget", "Profit", "Average review", "Average age of viewer", "Number of Views"]
# Coerce the measurement columns to numbers; unparseable text such as "asdf"
# becomes NaN and every column ends up with a numeric dtype, which the later
# mean-imputation and rescaling steps rely on.
table[numeric_columns] = table[numeric_columns].apply(pd.to_numeric, errors="coerce")
# Drop only rows with no numeric value at all; rows with partial gaps are kept
# so they can be imputed later.
table = table.dropna(subset=numeric_columns, how="all")

In [8]:
# Preview the first five rows to confirm the junk row is gone.
table.head(n=5)

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating
0,77000000.0,610000.0,9.2,60.0,6700.0,PG-13
1,63000000.0,720000.0,,65.0,7200.0,R
2,69000000.0,610000.0,9.466667,67.0,6900.0,PG
4,64000000.0,720000.0,8.666667,65.0,7300.0,PG-13
5,77000000.0,610000.0,,60.0,7100.0,G


In [9]:
# One-hot encode the categorical film rating into Film-rating_<value> columns.
# dtype is pinned to np.uint8 so the indicators are 0/1 integers on every
# pandas version; pandas >= 2.0 defaults to booleans, which would be written
# to the exported CSV as True/False instead of numbers.
table = pd.get_dummies(table, columns=['Film-rating'], dtype=np.uint8)

In [10]:
# Preview the first five rows with the one-hot encoded rating columns.
table.head(n=5)

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating_G,Film-rating_PG,Film-rating_PG-13,Film-rating_R
0,77000000.0,610000.0,9.2,60.0,6700.0,0,0,1,0
1,63000000.0,720000.0,,65.0,7200.0,0,0,0,1
2,69000000.0,610000.0,9.466667,67.0,6900.0,0,1,0,0
4,64000000.0,720000.0,8.666667,65.0,7300.0,0,0,1,0
5,77000000.0,610000.0,,60.0,7100.0,1,0,0,0


# Impute Data

In [11]:
# Mean imputation: fill each missing value with the mean of its column.
# numeric_only=True limits the means to numeric columns, so a leftover text
# column cannot raise a TypeError (pandas >= 2.0 no longer skips such columns
# silently); columns without a computed mean are simply left unfilled.
table = table.fillna(table.mean(numeric_only=True))

In [12]:
# Preview the first five rows to confirm the gaps were filled.
table.head(n=5)

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating_G,Film-rating_PG,Film-rating_PG-13,Film-rating_R
0,77000000.0,610000.0,9.2,60.0,6700.0,0,0,1,0
1,63000000.0,720000.0,8.973196,65.0,7200.0,0,0,0,1
2,69000000.0,610000.0,9.466667,67.0,6900.0,0,1,0,0
4,64000000.0,720000.0,8.666667,65.0,7300.0,0,0,1,0
5,77000000.0,610000.0,8.973196,60.0,7100.0,1,0,0,0


In [13]:
# Standardize the continuous columns; the one-hot rating columns are left as they are.
columns_to_rescale = [
    "Budget",
    "Profit",
    "Average review",
    "Average age of viewer",
    "Number of Views",
]
# Learn each column's mean and standard deviation, then apply the scaling.
scaler = preprocessing.StandardScaler().fit(table[columns_to_rescale])
table[columns_to_rescale] = scaler.transform(table[columns_to_rescale])

In [14]:
# Display the whole table (all rows) to inspect the rescaled values.
table

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating_G,Film-rating_PG,Film-rating_PG-13,Film-rating_R
0,1.692756,-0.844093,0.565954,-1.343420,-1.091740,0,0,1,0
1,-0.824583,1.214577,0.000000,-0.405489,0.255919,0,0,0,1
2,0.254277,-0.844093,1.231378,-0.030317,-0.552676,0,1,0,0
4,-0.644773,1.214577,-0.764895,-0.405489,0.525451,0,0,1,0
5,1.692756,-0.844093,0.000000,-1.343420,-0.013613,1,0,0,0
6,-0.824583,1.588880,-0.432183,-0.593075,0.255919,0,1,0,0
7,-1.184202,0.000000,-1.097607,1.470372,1.873109,0,1,0,0
8,-0.824583,0.653121,-1.097607,1.470372,1.603578,0,0,0,1
9,1.333136,-1.031244,0.565954,-0.968248,-0.822208,1,0,0,0
10,1.512946,-1.218396,-0.099471,-0.968248,-1.091740,0,0,0,1


# Export to Simbrain

In [None]:
# Write the processed table to a CSV file containing values only:
# no column-name row and no index column.
table.to_csv(
    path_or_buf="Data_wrangling_table.csv",
    index=False,
    header=False,
)

In [15]:
# Show the first ten rows of the processed table.
table.head(n=10)

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating_G,Film-rating_PG,Film-rating_PG-13,Film-rating_R
0,1.692756,-0.844093,0.565954,-1.34342,-1.09174,0,0,1,0
1,-0.824583,1.214577,0.0,-0.405489,0.255919,0,0,0,1
2,0.254277,-0.844093,1.231378,-0.030317,-0.552676,0,1,0,0
4,-0.644773,1.214577,-0.764895,-0.405489,0.525451,0,0,1,0
5,1.692756,-0.844093,0.0,-1.34342,-0.013613,1,0,0,0
6,-0.824583,1.58888,-0.432183,-0.593075,0.255919,0,1,0,0
7,-1.184202,0.0,-1.097607,1.470372,1.873109,0,1,0,0
8,-0.824583,0.653121,-1.097607,1.470372,1.603578,0,0,0,1
9,1.333136,-1.031244,0.565954,-0.968248,-0.822208,1,0,0,0
10,1.512946,-1.218396,-0.099471,-0.968248,-1.09174,0,0,0,1


In [18]:
# Small hand-written target table: three rows, each giving the probability
# that a child and that an adult will view a film.
output_dataset = pd.DataFrame({
    'Probability Child Will View': [.01, .7, .1],
    'Probability Adult Will View': [.9, .2, .1],
})

Unnamed: 0,Budget,Profit,Average review,Average age of viewer,Number of Views,Film-rating_G,Film-rating_PG,Film-rating_PG-13,Film-rating_R
0,1.692756,-0.844093,0.565954,-1.343420,-1.091740,0,0,1,0
1,-0.824583,1.214577,0.000000,-0.405489,0.255919,0,0,0,1
2,0.254277,-0.844093,1.231378,-0.030317,-0.552676,0,1,0,0
4,-0.644773,1.214577,-0.764895,-0.405489,0.525451,0,0,1,0
5,1.692756,-0.844093,0.000000,-1.343420,-0.013613,1,0,0,0
6,-0.824583,1.588880,-0.432183,-0.593075,0.255919,0,1,0,0
7,-1.184202,0.000000,-1.097607,1.470372,1.873109,0,1,0,0
8,-0.824583,0.653121,-1.097607,1.470372,1.603578,0,0,0,1
9,1.333136,-1.031244,0.565954,-0.968248,-0.822208,1,0,0,0
10,1.512946,-1.218396,-0.099471,-0.968248,-1.091740,0,0,0,1
