In [52]:
import pandas as pd
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import re
import datetime
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [53]:
# Bag of words 词袋模型 相当于one hot
# input: 演员那一列
def actors_countvectorize(df_actors):
    vectorizer = CountVectorizer(token_pattern='[^,]+')
    result = vectorizer.fit_transform(df_actors)
    print(result.shape)
    print(vectorizer.vocabulary_)
    print(vectorizer.get_feature_names())
    # 返回新生成的演员总数个列以及vectorizer（用于对测试集进行transform）
    return pd.DataFrame(result.toarray(), columns=["actors_" + x for x in vectorizer.get_feature_names()]), vectorizer


# Feature hashing
# 可以调整生成新的列的维度（n_features）
# input: 演员那一列
def actors_featurehashing(df_actors):
    vectorizer = HashingVectorizer(n_features=2**9, alternate_sign=False, norm=None)
    result = vectorizer.fit_transform(df_actors)
    print(result.shape)
    columns = ["actors_hash_val_{}".format(i) for i in range(result.shape[1])]
    # 返回新生成的2的9次方个列以及vectorizer
    return pd.DataFrame(result.toarray(), columns=columns), vectorizer


# Calculate the stars' value of each movie
# df：测试集， star_list： 下面的function得到的encode_map
def calculate_stars_value(df, star_list):
    df_temp = df.explode('stars')
    df_temp.loc[:, 'stars_value'] = df_temp['stars'].map(star_list)
    result = df_temp.groupby('movie')['stars_value'].sum()
    # 返回添加了star_value列的测试集
    return df.merge(result, on='movie', how='outer')


# Actors mean/sum encoding
# input: df-训练集， method-取mean或者sum，benchmark-某一列的名字，以gross为基准或者某个评分为基准？
def actors_mean_sum_encoding(df, method='mean', benchmark='gross'):
    df_temp = df.explode('stars')
    if method == 'mean':
        encode_map = df_temp.groupby('stars')[benchmark].mean()
    if method == 'sum':
        encode_map = df_temp.grouby('stars')[benchmark].sum()
    # encode_map: the total box office of each actor
    # 相当于一个map，key为演员名字，value为他的平均或总票房，需要用它替换测试集中的演员字段
    print(encode_map)
    # 返回添加了star_value列的训练集
    return calculate_stars_value(df, encode_map), encode_map


# Mean/sum encoding  用于director， production_company， country（这个要么用票房平均或者直接onehot？）
def mean_sum_encoding(df, column_name, method='mean', benchmark='gross'):
    if method == 'mean':
        encode_map = df.groupby(column_name)[benchmark].mean()
    if method == 'sum':
        encode_map = df.groupby(column_name)[benchmark].sum()
    print(encode_map)
    # 返回新的一列
    return df[column_name].map(encode_map), encode_map


# One-hot 用于content_rating 以及country?
def onehot_encoding(df, prefix, column_name):
    ohc = OneHotEncoder()
    result = ohc.fit_transform(df[column_name].values.reshape(-1, 1)).toarray()
    dfOneHot = pd.DataFrame(result, columns = [prefix + str(ohc.categories_[0][i]) for i in range(len(ohc.categories_[0]))])
    return pd.concat([df, dfOneHot], axis=1), ohc


# Process release_date
def release_date_process(df):
    days = []
    months = []
    date_pattern = re.compile(r"[0-9]+-[a-zA-Z]+-[0-9]+")
    date_pattern_2 = re.compile(r"[0-9]+-[a-zA-Z]+")
    for idx, row in df.iterrows():
        date_str = row['release_date']
        if date_pattern.match(date_str):
            date = datetime.datetime.strptime(date_str, '%d-%b-%y')
        elif date_pattern_2.match(date_str):
            date = datetime.datetime.strptime(date_str, '%d-%b')
        else:
            date = datetime.datetime(2020, 6, 30)
        months.append(date.month)
        days.append(date.day)
    df['month'] = months
    df['day'] = days
    return df


# Process runtime
# 去掉min
def runtime_process(df_runtime):
    return df_runtime.apply(lambda x: re.findall(r'\d+', x)[0] if len(re.findall(r'\d+', x)) > 0 else x)


# Genre countvectorize 不知道总数有多少，太多的话可以考虑用hashing或者也用平均票房？
def genre_countvectorize(df_genre):
    vectorizer = CountVectorizer(token_pattern='[^,]+')
    result = vectorizer.fit_transform(df_genre)
    print(result.shape)
    # 返回genre总数个列
    return pd.DataFrame(result.toarray(), columns=["genre_" + x for x in vectorizer.get_feature_names()]), vectorizer


# Frequency encoding  用某个category的出现频率来代替
def frequency_encoding(df, column_name):
    fe = df.groupby(column_name).size() / len(df)
    # 返回新的一列以及频率map
    return df[column_name].map(fe), fe

In [54]:
df = pd.read_csv('data/boxoffice_dataset.csv', index_col=0, keep_default_na=False)
df['stars'] = df['stars'].apply(lambda x: eval(x))
df = release_date_process(df)
df['runtime'] = runtime_process(df['runtime'])
df, rating_onehoter = onehot_encoding(df, prefix='rating_', column_name='content_rating')
df, country_onehoter = onehot_encoding(df, prefix='country_', column_name='country')
df_new, genre_vectorizer = genre_countvectorize(df['genre'])
df = pd.concat([df, df_new], axis=1)

(5950, 42)


In [55]:
# 这个生成的y_train和y_test不能直接用，后面改actors的特征的时候对数据集的顺序有影响，所以后面直接从训练集里把gross取出来当y
X_train, X_test, y_train, y_test = train_test_split(df, df['gross'], test_size=0.3)

In [56]:
X_train.head()

Unnamed: 0,movie,year,release_date,content_rating,runtime,genre,intro,director,stars,imdb,...,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western
1212,Burke and Hare,2010,9-Sep-11,R,91,"Comedy, Crime, History",A black comedy about two nineteenth century gr...,John Landis,"[Bill Bailey, Tom Wilkinson, Michael Smiley, T...",6.2,...,0,0,0,0,0,0,0,0,0,0
1806,Five Broken Cameras,2011,23-Jan-12,Not Rated,94,"Documentary, War",A documentary on a Palestinian farmer's chroni...,Emad Burnat,"[Guy Davidi, Emad Burnat, Soraya Burnat, Moham...",8.0,...,0,0,0,0,0,0,0,0,0,0
1864,All Together,2011,19-Oct-12,Not Rated,96,"Comedy, Drama",Five old friends decide to move in together as...,Stéphane Robelin,"[Guy Bedos, Daniel Brühl, Geraldine Chaplin, J...",6.7,...,0,0,0,0,0,0,0,0,0,0
748,OSS 117: Lost in Rio,2009,14-Jun-09,Not Rated,101,"Action, Adventure, Comedy",Another mission of world-known French secret a...,Michel Hazanavicius,"[Jean Dujardin, Louise Monot, Rüdiger Vogler, ...",6.9,...,0,0,0,0,0,0,0,0,0,0
2594,Enemy,2013,9-Jan-14,R,91,"Drama, Mystery, Thriller",A man seeks out his exact look-alike after spo...,Denis Villeneuve,"[Jake Gyllenhaal, Mélanie Laurent, Sarah Gadon...",6.9,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# Director
new_column, director_map = mean_sum_encoding(X_train, 'director')
X_train.loc[:, 'director'] = new_column

director
                             7.490556e+05
A.B. Shawky                  1.130800e+04
A.J. Eaton                   6.197320e+05
A.R. Murugadoss              2.430631e+06
A.T. White                   8.147000e+03
                                 ...     
Zoya Akhtar                  3.554920e+05
Álex de la Iglesia           3.527143e+05
Árni Ásgeirsson              7.781500e+04
Ása Helga Hjörleifsdóttir    8.980000e+02
Émile Gaudreault             4.518550e+06
Name: gross, Length: 3017, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [58]:
X_train

Unnamed: 0,movie,year,release_date,content_rating,runtime,genre,intro,director,stars,imdb,...,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western
1212,Burke and Hare,2010,9-Sep-11,R,91,"Comedy, Crime, History",A black comedy about two nineteenth century gr...,4.833000e+03,"[Bill Bailey, Tom Wilkinson, Michael Smiley, T...",6.2,...,0,0,0,0,0,0,0,0,0,0
1806,Five Broken Cameras,2011,23-Jan-12,Not Rated,94,"Documentary, War",A documentary on a Palestinian farmer's chroni...,1.099830e+05,"[Guy Davidi, Emad Burnat, Soraya Burnat, Moham...",8.0,...,0,0,0,0,0,0,0,0,0,0
1864,All Together,2011,19-Oct-12,Not Rated,96,"Comedy, Drama",Five old friends decide to move in together as...,4.318000e+04,"[Guy Bedos, Daniel Brühl, Geraldine Chaplin, J...",6.7,...,0,0,0,0,0,0,0,0,0,0
748,OSS 117: Lost in Rio,2009,14-Jun-09,Not Rated,101,"Action, Adventure, Comedy",Another mission of world-known French secret a...,1.494710e+07,"[Jean Dujardin, Louise Monot, Rüdiger Vogler, ...",6.9,...,0,0,0,0,0,0,0,0,0,0
2594,Enemy,2013,9-Jan-14,R,91,"Drama, Mystery, Thriller",A man seeks out his exact look-alike after spo...,4.947076e+07,"[Jake Gyllenhaal, Mélanie Laurent, Sarah Gadon...",6.9,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5600,God Bless the Broken Road,2018,7-Sep-18,PG,111,"Drama, Family, Romance","While grieving the loss of her husband, a fina...",2.264789e+07,"[Matthew Derek Davis, Arthur Cartwright, Adam ...",5.4,...,0,0,0,0,0,0,0,0,0,0
2550,The Great Gatsby,2013,10-May-13,PG-13,143,"Drama, Romance","A writer and wall street trader, Nick, finds h...",1.448404e+08,"[Leonardo DiCaprio, Carey Mulligan, Joel Edger...",7.2,...,0,0,0,0,0,0,0,0,0,0
884,Not Since You,2009,25-Oct-09,PG-13,90,"Drama, Romance",A romantic drama about a tight-knit group of c...,5.670000e+03,"[Desmond Harrington, Kathleen Robertson, Chris...",5.6,...,0,0,0,0,0,0,0,0,0,0
2026,Looper,2012,28-Sep-12,R,113,"Action, Crime, Drama","In 2074, when the mob wants to get rid of some...",3.500292e+07,"[Joseph Gordon-Levitt, Bruce Willis, Emily Blu...",7.4,...,0,0,0,0,0,0,0,0,0,0


In [59]:
df_new, stars_map = actors_mean_sum_encoding(X_train)
X_train = df_new

stars
                          7.490556e+05
'Freeway' Ricky Ross      1.500000e+06
50 Cent                   2.385995e+07
A.J. Benza                1.793000e+04
A.R. Bernard              1.200000e+06
                              ...     
Ólafur Darri Ólafsson     2.736766e+07
Óscar Jaenada             4.250250e+05
Óscar Zafra               2.776600e+04
Þorsteinn Bachmann        6.759600e+04
Þrúður Kristjánsdóttir    7.274300e+05
Name: gross, Length: 10262, dtype: float64


In [60]:
X_train

Unnamed: 0,movie,year,release_date,content_rating,runtime,genre,intro,director,stars,imdb,...,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western,stars_value
0,Burke and Hare,2010,9-Sep-11,R,91,"Comedy, Crime, History",A black comedy about two nineteenth century gr...,4.833000e+03,"[Bill Bailey, Tom Wilkinson, Michael Smiley, T...",6.2,...,0,0,0,0,0,0,0,0,0,2.181021e+07
1,Five Broken Cameras,2011,23-Jan-12,Not Rated,94,"Documentary, War",A documentary on a Palestinian farmer's chroni...,1.099830e+05,"[Guy Davidi, Emad Burnat, Soraya Burnat, Moham...",8.0,...,0,0,0,0,0,0,0,0,0,5.499150e+05
2,All Together,2011,19-Oct-12,Not Rated,96,"Comedy, Drama",Five old friends decide to move in together as...,4.318000e+04,"[Guy Bedos, Daniel Brühl, Geraldine Chaplin, J...",6.7,...,0,0,0,0,0,0,0,0,0,4.226675e+07
3,OSS 117: Lost in Rio,2009,14-Jun-09,Not Rated,101,"Action, Adventure, Comedy",Another mission of world-known French secret a...,1.494710e+07,"[Jean Dujardin, Louise Monot, Rüdiger Vogler, ...",6.9,...,0,0,0,0,0,0,0,0,0,8.499838e+06
4,Enemy,2013,9-Jan-14,R,91,"Drama, Mystery, Thriller",A man seeks out his exact look-alike after spo...,4.947076e+07,"[Jake Gyllenhaal, Mélanie Laurent, Sarah Gadon...",6.9,...,0,0,0,0,0,0,0,0,0,4.417227e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4160,God Bless the Broken Road,2018,7-Sep-18,PG,111,"Drama, Family, Romance","While grieving the loss of her husband, a fina...",2.264789e+07,"[Matthew Derek Davis, Arthur Cartwright, Adam ...",5.4,...,0,0,0,0,0,0,0,0,0,1.138503e+07
4161,The Great Gatsby,2013,10-May-13,PG-13,143,"Drama, Romance","A writer and wall street trader, Nick, finds h...",1.448404e+08,"[Leonardo DiCaprio, Carey Mulligan, Joel Edger...",7.2,...,0,0,0,0,0,0,0,0,0,2.572247e+08
4162,Not Since You,2009,25-Oct-09,PG-13,90,"Drama, Romance",A romantic drama about a tight-knit group of c...,5.670000e+03,"[Desmond Harrington, Kathleen Robertson, Chris...",5.6,...,0,0,0,0,0,0,0,0,0,2.192300e+04
4163,Looper,2012,28-Sep-12,R,113,"Action, Crime, Drama","In 2074, when the mob wants to get rid of some...",3.500292e+07,"[Joseph Gordon-Levitt, Bruce Willis, Emily Blu...",7.4,...,0,0,0,0,0,0,0,0,0,1.666278e+08


In [61]:
# Production_company
new_column, company_map = mean_sum_encoding(X_train, 'production_company')
X_train.loc[:, 'production_company'] = new_column

production_company
                        4.044839e+05
10 West Studios         1.830582e+06
100% Synthetic Films    3.181000e+03
1031 Films              9.873000e+03
108 Productions         3.826100e+04
                            ...     
isotopefilms            3.031000e+05
kNow Productions        1.344840e+05
micro_scope             2.958589e+06
nWave Pictures          7.411127e+06
thefyzz                 4.430719e+07
Name: gross, Length: 2406, dtype: float64


In [62]:
list(X_train)

['movie',
 'year',
 'release_date',
 'content_rating',
 'runtime',
 'genre',
 'intro',
 'director',
 'stars',
 'imdb',
 'gross',
 'metascore',
 'vote',
 'country',
 'budget',
 'worldwide_gross',
 'production_company',
 'id',
 'extracted_name',
 'avg_all',
 'avg_30',
 'max_30',
 'near_holiday',
 'month',
 'day',
 'rating_G',
 'rating_M',
 'rating_NC-17',
 'rating_Not Rated',
 'rating_PG',
 'rating_PG-13',
 'rating_R',
 'country_',
 'country_Afghanistan',
 'country_Argentina',
 'country_Australia',
 'country_Austria',
 'country_Bahamas',
 'country_Belarus',
 'country_Belgium',
 'country_Bosnia and Herzegovina',
 'country_Brazil',
 'country_Bulgaria',
 'country_Cambodia',
 'country_Canada',
 'country_Chile',
 'country_China',
 'country_Colombia',
 'country_Cuba',
 'country_Czech Republic',
 'country_Denmark',
 'country_Dominican Republic',
 'country_Egypt',
 'country_Estonia',
 'country_Ethiopia',
 'country_Finland',
 'country_France',
 'country_Georgia',
 'country_Germany',
 'country_Gre

In [63]:
features = list(X_train)
remove_list = ['movie', 'release_date', 'content_rating', 'genre', 'intro', 'stars', 'country', \
               'budget', 'worldwide_gross', 'id', 'extracted_name']
for item in remove_list:
    features.remove(item)

X_train = X_train[features]
X_train.to_csv('data/trainingset.csv')

In [64]:
X_train

Unnamed: 0,year,runtime,director,imdb,gross,metascore,vote,production_company,avg_all,avg_30,...,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western,stars_value
0,2010,91,4.833000e+03,6.2,4833,46,18890,1.330578e+06,,,...,0,0,0,0,0,0,0,0,0,2.181021e+07
1,2011,94,1.099830e+05,8.0,109983,78,5609,1.099830e+05,,,...,0,0,0,0,0,0,0,0,0,5.499150e+05
2,2011,96,4.318000e+04,6.7,43180,57,2724,4.318000e+04,,,...,0,0,0,0,0,0,0,0,0,4.226675e+07
3,2009,101,1.494710e+07,6.9,87353,58,12370,2.977544e+05,,,...,0,0,0,0,0,0,0,0,0,8.499838e+06
4,2013,91,4.947076e+07,6.9,1007088,61,154010,6.622030e+06,,,...,0,0,0,0,0,0,0,0,0,4.417227e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4160,2018,111,2.264789e+07,5.4,2846257,31,752,1.830582e+06,0,0,...,0,0,0,0,0,0,0,0,0,1.138503e+07
4161,2013,143,1.448404e+08,7.2,144840419,55,453842,1.208209e+08,,,...,0,0,0,0,0,0,0,0,0,2.572247e+08
4162,2009,90,5.670000e+03,5.6,5670,,1729,5.670000e+03,,,...,0,0,0,0,0,0,0,0,0,2.192300e+04
4163,2012,113,3.500292e+07,7.4,66486205,84,516715,2.402970e+07,,,...,0,0,0,0,0,0,0,0,0,1.666278e+08


In [65]:
y_train = X_train['gross']

In [66]:
y_train

0            4833
1          109983
2           43180
3           87353
4         1007088
          ...    
4160      2846257
4161    144840419
4162         5670
4163     66486205
4164        64437
Name: gross, Length: 4165, dtype: int64

In [67]:
X_train.drop('gross',axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [68]:
X_train

Unnamed: 0,year,runtime,director,imdb,metascore,vote,production_company,avg_all,avg_30,max_30,...,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western,stars_value
0,2010,91,4.833000e+03,6.2,46,18890,1.330578e+06,,,,...,0,0,0,0,0,0,0,0,0,2.181021e+07
1,2011,94,1.099830e+05,8.0,78,5609,1.099830e+05,,,,...,0,0,0,0,0,0,0,0,0,5.499150e+05
2,2011,96,4.318000e+04,6.7,57,2724,4.318000e+04,,,,...,0,0,0,0,0,0,0,0,0,4.226675e+07
3,2009,101,1.494710e+07,6.9,58,12370,2.977544e+05,,,,...,0,0,0,0,0,0,0,0,0,8.499838e+06
4,2013,91,4.947076e+07,6.9,61,154010,6.622030e+06,,,,...,0,0,0,0,0,0,0,0,0,4.417227e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4160,2018,111,2.264789e+07,5.4,31,752,1.830582e+06,0,0,0,...,0,0,0,0,0,0,0,0,0,1.138503e+07
4161,2013,143,1.448404e+08,7.2,55,453842,1.208209e+08,,,,...,0,0,0,0,0,0,0,0,0,2.572247e+08
4162,2009,90,5.670000e+03,5.6,,1729,5.670000e+03,,,,...,0,0,0,0,0,0,0,0,0,2.192300e+04
4163,2012,113,3.500292e+07,7.4,84,516715,2.402970e+07,,,,...,0,0,0,0,0,0,0,0,0,1.666278e+08


In [69]:
X_test.head()

Unnamed: 0,movie,year,release_date,content_rating,runtime,genre,intro,director,stars,imdb,...,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western
3747,A Bigger Splash,2015,13-May-16,R,125,"Drama, Thriller",The vacation of a famous rock star and her boy...,Luca Guadagnino,"[Tilda Swinton, Matthias Schoenaerts, Ralph Fi...",6.4,...,0,0,0,0,0,0,0,0,0,0
4948,Journey to the West: The Demons Strike Back,2017,3-Feb-17,PG-13,109,"Adventure, Comedy, Family",A monk and his three disciples continue on the...,Hark Tsui,"[Kris Wu, Kenny Lin, Chen Yao, Yun Lin]",5.4,...,0,0,0,0,0,0,0,0,0,0
352,Filth and Wisdom,2008,18-Jun-08,Not Rated,84,"Comedy, Drama, Music",A comedy centered on three flatmates living de...,Madonna,"[Eugene Hutz, Holly Weston, Vicky McClure, Ric...",5.6,...,0,0,0,0,0,0,0,0,0,0
4087,Jasmine,2015,12-May-17,Not Rated,80,Thriller,"A year after his wife's murder, once-successfu...",Dax Phelan,"[Jason Tobin, Byron Mann, Sarah Lian, Eugenia ...",5.7,...,0,0,0,0,0,0,0,0,1,0
4163,Allegiant,2016,18-Mar-16,PG-13,120,"Action, Adventure, Mystery",After the earth-shattering revelations of Insu...,Robert Schwentke,"[Shailene Woodley, Theo James, Jeff Daniels, N...",5.7,...,0,0,0,0,0,0,0,0,0,0


In [70]:
X_test.loc[:, 'director'] = X_test['director'].map(director_map)

In [71]:
X_test

Unnamed: 0,movie,year,release_date,content_rating,runtime,genre,intro,director,stars,imdb,...,genre_horror,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western
3747,A Bigger Splash,2015,13-May-16,R,125,"Drama, Thriller",The vacation of a famous rock star and her boy...,8.525070e+06,"[Tilda Swinton, Matthias Schoenaerts, Ralph Fi...",6.4,...,0,0,0,0,0,0,0,0,0,0
4948,Journey to the West: The Demons Strike Back,2017,3-Feb-17,PG-13,109,"Adventure, Comedy, Family",A monk and his three disciples continue on the...,1.623477e+05,"[Kris Wu, Kenny Lin, Chen Yao, Yun Lin]",5.4,...,0,0,0,0,0,0,0,0,0,0
352,Filth and Wisdom,2008,18-Jun-08,Not Rated,84,"Comedy, Drama, Music",A comedy centered on three flatmates living de...,5.834550e+05,"[Eugene Hutz, Holly Weston, Vicky McClure, Ric...",5.6,...,0,0,0,0,0,0,0,0,0,0
4087,Jasmine,2015,12-May-17,Not Rated,80,Thriller,"A year after his wife's murder, once-successfu...",,"[Jason Tobin, Byron Mann, Sarah Lian, Eugenia ...",5.7,...,0,0,0,0,0,0,0,0,1,0
4163,Allegiant,2016,18-Mar-16,PG-13,120,"Action, Adventure, Mystery",After the earth-shattering revelations of Insu...,9.038016e+07,"[Shailene Woodley, Theo James, Jeff Daniels, N...",5.7,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2192,The Angels' Share,2012,9-Feb-13,R,101,"Comedy, Crime, Drama","Narrowly avoiding jail, new dad Robbie vows to...",5.580400e+04,"[Paul Brannigan, John Henshaw, Roger Allam, Ga...",7.0,...,0,0,0,0,0,0,0,0,0,0
479,Dolphins and Whales 3D: Tribes of the Ocean,2008,15-Feb-08,Not Rated,42,"Documentary, Short, Adventure",This documentary goes to coral reefs of the Ba...,,"[Daryl Hannah, Charlotte Rampling]",6.3,...,0,0,0,0,0,0,0,0,0,0
3059,Witch Graveyard,2013,14-Jun-13,Not Rated,70,"Fantasy, Horror",Four travelers innocently decide to rest in a ...,,"[Catherine Franklin, Reuben Rox, Rachel Wise, ...",6.9,...,0,0,0,0,0,0,0,0,0,0
3517,I nostri ragazzi,2014,3-May-15,Not Rated,92,Drama,Tensions between two brothers and their famili...,,"[Alessandro Gassmann, Giovanna Mezzogiorno, Lu...",6.6,...,0,0,0,0,0,0,0,0,0,0


In [72]:
X_test.loc[:, 'production_company'] = X_test['production_company'].map(company_map)

In [73]:
X_test = calculate_stars_value(X_test, stars_map)

In [74]:
X_test

Unnamed: 0,movie,year,release_date,content_rating,runtime,genre,intro,director,stars,imdb,...,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western,stars_value
0,A Bigger Splash,2015,13-May-16,R,125,"Drama, Thriller",The vacation of a famous rock star and her boy...,8.525070e+06,"[Tilda Swinton, Matthias Schoenaerts, Ralph Fi...",6.4,...,0,0,0,0,0,0,0,0,0,1.362901e+08
1,Journey to the West: The Demons Strike Back,2017,3-Feb-17,PG-13,109,"Adventure, Comedy, Family",A monk and his three disciples continue on the...,1.623477e+05,"[Kris Wu, Kenny Lin, Chen Yao, Yun Lin]",5.4,...,0,0,0,0,0,0,0,0,0,3.776627e+06
2,Filth and Wisdom,2008,18-Jun-08,Not Rated,84,"Comedy, Drama, Music",A comedy centered on three flatmates living de...,5.834550e+05,"[Eugene Hutz, Holly Weston, Vicky McClure, Ric...",5.6,...,0,0,0,0,0,0,0,0,0,1.684187e+07
3,Jasmine,2015,12-May-17,Not Rated,80,Thriller,"A year after his wife's murder, once-successfu...",,"[Jason Tobin, Byron Mann, Sarah Lian, Eugenia ...",5.7,...,0,0,0,0,0,0,0,1,0,0.000000e+00
4,Allegiant,2016,18-Mar-16,PG-13,120,"Action, Adventure, Mystery",After the earth-shattering revelations of Insu...,9.038016e+07,"[Shailene Woodley, Theo James, Jeff Daniels, N...",5.7,...,0,0,0,0,0,0,0,0,0,1.766970e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780,The Angels' Share,2012,9-Feb-13,R,101,"Comedy, Crime, Drama","Narrowly avoiding jail, new dad Robbie vows to...",5.580400e+04,"[Paul Brannigan, John Henshaw, Roger Allam, Ga...",7.0,...,0,0,0,0,0,0,0,0,0,5.757500e+05
1781,Dolphins and Whales 3D: Tribes of the Ocean,2008,15-Feb-08,Not Rated,42,"Documentary, Short, Adventure",This documentary goes to coral reefs of the Ba...,,"[Daryl Hannah, Charlotte Rampling]",6.3,...,0,0,0,0,0,0,0,0,0,1.631491e+07
1782,Witch Graveyard,2013,14-Jun-13,Not Rated,70,"Fantasy, Horror",Four travelers innocently decide to rest in a ...,,"[Catherine Franklin, Reuben Rox, Rachel Wise, ...",6.9,...,0,0,0,0,0,0,0,0,0,0.000000e+00
1783,I nostri ragazzi,2014,3-May-15,Not Rated,92,Drama,Tensions between two brothers and their famili...,,"[Alessandro Gassmann, Giovanna Mezzogiorno, Lu...",6.6,...,0,0,0,0,0,0,0,0,0,6.253380e+05


In [75]:
X_test = X_test[features]
X_test.to_csv('data/testingset.csv')

In [76]:
X_test

Unnamed: 0,year,runtime,director,imdb,gross,metascore,vote,production_company,avg_all,avg_30,...,genre_music,genre_musical,genre_mystery,genre_romance,genre_sci-fi,genre_short,genre_sport,genre_thriller,genre_western,stars_value
0,2015,125,8.525070e+06,6.4,1982505,74,25129,1.028487e+07,0.033742331,0.25,...,0,0,0,0,0,0,0,0,0,1.362901e+08
1,2017,109,1.623477e+05,5.4,880346,59,2522,,0.009202454,0,...,0,0,0,0,0,0,0,0,0,3.776627e+06
2,2008,84,5.834550e+05,5.6,22406,26,2364,5.834550e+05,0,0,...,0,0,0,0,0,0,0,0,0,1.684187e+07
3,2015,80,,5.7,18373,,125,,0.693251534,0.5,...,0,0,0,0,0,0,0,1,0,0.000000e+00
4,2016,120,9.038016e+07,5.7,66184051,33,101853,2.992643e+08,1.058282209,19.5,...,0,0,0,0,0,0,0,0,0,1.766970e+08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1780,2012,101,5.580400e+04,7.0,304357,66,22519,1.045124e+07,,,...,0,0,0,0,0,0,0,0,0,5.757500e+05
1781,2008,42,,6.3,7518876,,230,,0,0,...,0,0,0,0,0,0,0,0,0,1.631491e+07
1782,2013,70,,6.9,9554,,330,,,,...,0,0,0,0,0,0,0,0,0,0.000000e+00
1783,2014,92,,6.6,6335,,1369,,0,0,...,0,0,0,0,0,0,0,0,0,6.253380e+05
