In [360]:
import pandas as pd
import json
import statsmodels.formula.api as smf
import re
import statsmodels.regression.linear_model as sm


with open('result2.json') as f:
    data = json.load(f)

with open('studios.txt') as f:
    studios = []
    for s in f:
        studios.append(s.strip())

In [361]:
df = pd.DataFrame(data).drop(['download_latency', 'download_slot','download_timeout', 'depth'], axis=1)
df.shape

(2127, 15)

In [362]:
df = df.applymap(lambda x: np.nan if ((str(x).lower() == 'nan') or (str(x) == 'n/a')) else x)

In [417]:
# Genre
df.genre = df.genre.str.replace('-', '_')
genre_pre = df.set_index('title').genre.str.split(',', expand=True).stack().reset_index(level=1, drop=True).to_frame('genre')
dummy_genre = pd.get_dummies(genre_pre, prefix='g', columns=['genre']).groupby(level=0).sum().reset_index()
col = dummy_genre.columns.str.replace(' ', '')
dummy_genre.columns = col

In [418]:
# Language
df.language = df.language.str.replace(' ', '_')
language_pre = df.set_index('title').language.str.split(',', expand=True).stack().reset_index(level=1, drop=True).to_frame('language')
dummy_language = pd.get_dummies(language_pre, prefix='l', columns=['language']).groupby(level=0).sum().reset_index()

In [419]:
# MPAA
df.mpaa = df.mpaa.str.replace('-', '_')
df.mpaa = df.mpaa.str.replace('PG_13', 'PGthirt')
df.mpaa = df.mpaa.str.replace('NC_17', 'NC')
df[df.mpaa == 'PG_'].iloc[0].mpaa = 'PGthirt' # Fix an odd value

mpaa_pre = df.set_index('title').mpaa.str.split(',', expand=True).stack().reset_index(level=1, drop=True).to_frame('mpaa')
dummy_mpaa = pd.get_dummies(mpaa_pre, prefix='m', columns=['mpaa']).groupby(level=0).sum().reset_index()

In [420]:
# Directors
director_pre = df.set_index('title').director.str.split(',', expand=True).stack().reset_index(level=1, drop=True).to_frame('director')
dummy_director = pd.get_dummies(director_pre, prefix='d', columns=['director']).groupby(level=0).sum().reset_index()

In [421]:
# Country
df.country = df.country.str.replace(' ', '_')
country_pre = df.set_index('title').country.str.split(',', expand=True).stack().reset_index(level=1, drop=True).to_frame('country')
dummy_country = pd.get_dummies(country_pre, prefix='d', columns=['country']).groupby(level=0).sum().reset_index()

In [422]:
df_clean = df.merge(dummy_genre)
df_clean = df_clean.merge(dummy_language)
df_clean = df_clean.merge(dummy_mpaa)
df_clean = df_clean.merge(dummy_country)
#df_clean = df_clean.merge(dummy_director)
df_clean.shape

(2126, 181)

In [423]:
# USD conversions
# Euro 
# Pound (check year conversion) grab year and exchange rate

# Clean
df_clean.lifetime = df_clean.lifetime.str.replace('$', '').str.replace(',', '')
df_clean.opening = df_clean.opening.str.replace('$', '').str.replace(',', '')

df_clean.mpaa.unique()

# clean budget

array(['PGthirt', 'PG', 'R', 'NC', 'PG_'], dtype=object)

In [424]:
df_clean.rating = df_clean.rating.apply(float)
df_clean.runtime = df_clean.runtime.apply(float)


In [425]:
# Temporary, need to do currency conversions, or scrap from box office mojo
#df_clean.budget = df_clean.opening.str.replace('$', '')
#df_clean.budget = df_clean.opening.apply(lambda x: x if x.isdigit() else np.nan)
df_clean.budget[:5]

0     $20000000
1    $250000000
2      $6500000
3    $110000000
4    $108000000
Name: budget, dtype: object

In [426]:
# df_clean[df_clean.budget.apply(lambda x: str(x)[0]) != '$'].sort_values(by='budget')

In [427]:
#df_clean['currency'] = df_clean.budget.apply(lambda x: re.search(r'([^\d]+)([^(\s]+)', str(x))[1].strip() if (x is not np.nan) else np.nan)

In [428]:
df_clean_dropped = df_clean.dropna(how='any')
list(df_clean_dropped.columns)

['budget',
 'country',
 'director',
 'genre',
 'language',
 'lifetime',
 'mojo_url',
 'mpaa',
 'opening',
 'rating',
 'rating_count',
 'runtime',
 'title',
 'url',
 'year',
 'g_Action',
 'g_Adventure',
 'g_Animation',
 'g_Biography',
 'g_Comedy',
 'g_Crime',
 'g_Drama',
 'g_Family',
 'g_Fantasy',
 'g_History',
 'g_Horror',
 'g_Music',
 'g_Musical',
 'g_Mystery',
 'g_Romance',
 'g_Sci_Fi',
 'g_Sport',
 'g_Thriller',
 'g_War',
 'g_Western',
 'l_',
 'l_Aboriginal',
 'l_Acholi',
 'l_Afrikaans',
 'l_Albanian',
 'l_Algonquin',
 'l_American_Sign_Language',
 'l_Apache_languages',
 'l_Arabic',
 'l_Aramaic',
 'l_Armenian',
 'l_Bambara',
 'l_Bengali',
 'l_Berber_languages',
 'l_Bulgarian',
 'l_Burmese',
 'l_Cantonese',
 'l_Catalan',
 'l_Chinese',
 'l_Croatian',
 'l_Czech',
 'l_Danish',
 'l_Dari',
 'l_Dutch',
 'l_Egyptian_(Ancient)',
 'l_English',
 'l_Esperanto',
 'l_Estonian',
 'l_Filipino',
 'l_Finnish',
 'l_Flemish',
 'l_French',
 'l_Georgian',
 'l_German',
 'l_Greek',
 'l_Greenlandic',
 'l_Haw

In [429]:
X = df_clean_dropped.drop(['l__Ancient_(to_1453)','lifetime','budget','director', 'genre', 'language', 'mpaa', 'url', 'rating_count', 'mojo_url', 'title', 'country'], axis=1)

In [430]:
list(X.columns)
def generate_formula(y_str, x_l):
    result = '{} ~ '.format(y_str)
    for v in x_l:
        result += '{} + '.format(v)
    return result[:-2]

formula_name = generate_formula('lifetime', list(X.columns)).strip()
formula_name
lm1 = smf.ols(formula_name, data=X)
#fit1 = lm1.fit()

PatsyError: Error evaluating factor: NameError: name 'l_Egyptian_' is not defined
    lifetime ~ opening + rating + runtime + year + g_Action + g_Adventure + g_Animation + g_Biography + g_Comedy + g_Crime + g_Drama + g_Family + g_Fantasy + g_History + g_Horror + g_Music + g_Musical + g_Mystery + g_Romance + g_Sci_Fi + g_Sport + g_Thriller + g_War + g_Western + l_ + l_Aboriginal + l_Acholi + l_Afrikaans + l_Albanian + l_Algonquin + l_American_Sign_Language + l_Apache_languages + l_Arabic + l_Aramaic + l_Armenian + l_Bambara + l_Bengali + l_Berber_languages + l_Bulgarian + l_Burmese + l_Cantonese + l_Catalan + l_Chinese + l_Croatian + l_Czech + l_Danish + l_Dari + l_Dutch + l_Egyptian_(Ancient) + l_English + l_Esperanto + l_Estonian + l_Filipino + l_Finnish + l_Flemish + l_French + l_Georgian + l_German + l_Greek + l_Greenlandic + l_Hawaiian + l_Hebrew + l_Hindi + l_Hmong + l_Hungarian + l_Ibo + l_Icelandic + l_Indonesian + l_Irish + l_Italian + l_Japanese + l_Japanese_Sign_Language + l_Khmer + l_Kirundi + l_Klingon + l_Korean + l_Latin + l_Lingala + l_Mandarin + l_Maori + l_Maya + l_Mende + l_Mongolian + l_Navajo + l_None + l_North_American_Indian + l_Norwegian + l_Nyanja + l_Old_English + l_Panjabi + l_Pawnee + l_Persian + l_Polish + l_Portuguese + l_Pushto + l_Romanian + l_Romany + l_Russian + l_Sanskrit + l_Scottish_Gaelic + l_Serbian + l_Serbo_Croatian + l_Sign_Languages + l_Slovak + l_Somali + l_Southern_Sotho + l_Spanish + l_Swahili + l_Swedish + l_Swiss_German + l_Syriac + l_Tagalog + l_Tamil + l_Thai + l_Tibetan + l_Turkish + l_Ukrainian + l_Urdu + l_Vietnamese + l_Welsh + l_Xhosa + l_Yiddish + l_Zulu + m_NC + m_PG + m_PG_ + m_PGthirt + m_R + d_Algeria + d_Australia + d_Austria + d_Bahamas + d_Belgium + d_Brazil + d_Canada + d_Chile + d_China + d_Czech_Republic + d_Denmark + d_Finland + d_France + d_Georgia + d_Germany + d_Greece + d_Hong_Kong + d_Hungary + d_Iceland + d_India + d_Indonesia + d_Iran + d_Ireland + d_Israel + d_Italy + d_Japan + d_Kenya + d_Malta + d_Mexico + d_New_Zealand + d_Norway + d_Palestine + d_Panama + d_Romania + d_Russia + d_South_Africa + d_South_Korea + d_Spain + d_Sweden + d_UK + d_USA
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       ^^^^^^^^^^^^^^^^^^^^

In [431]:
sm.OLS(df_clean_dropped.lifetime, X)

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [432]:
list(df_clean.columns)

['budget',
 'country',
 'director',
 'genre',
 'language',
 'lifetime',
 'mojo_url',
 'mpaa',
 'opening',
 'rating',
 'rating_count',
 'runtime',
 'title',
 'url',
 'year',
 'g_Action',
 'g_Adventure',
 'g_Animation',
 'g_Biography',
 'g_Comedy',
 'g_Crime',
 'g_Drama',
 'g_Family',
 'g_Fantasy',
 'g_History',
 'g_Horror',
 'g_Music',
 'g_Musical',
 'g_Mystery',
 'g_Romance',
 'g_Sci_Fi',
 'g_Sport',
 'g_Thriller',
 'g_War',
 'g_Western',
 'l_',
 'l_Aboriginal',
 'l_Acholi',
 'l_Afrikaans',
 'l_Albanian',
 'l_Algonquin',
 'l_American_Sign_Language',
 'l_Apache_languages',
 'l_Arabic',
 'l_Aramaic',
 'l_Armenian',
 'l_Bambara',
 'l_Bengali',
 'l_Berber_languages',
 'l_Bulgarian',
 'l_Burmese',
 'l_Cantonese',
 'l_Catalan',
 'l_Chinese',
 'l_Croatian',
 'l_Czech',
 'l_Danish',
 'l_Dari',
 'l_Dutch',
 'l_Egyptian_(Ancient)',
 'l_English',
 'l_Esperanto',
 'l_Estonian',
 'l_Filipino',
 'l_Finnish',
 'l_Flemish',
 'l_French',
 'l_Georgian',
 'l_German',
 'l_Greek',
 'l_Greenlandic',
 'l_Haw

In [216]:
X = df_clean.drop(['url', ])

164      10001982
1175     10003827
100     100038390
418        100040
679         10020
282      10021215
1023       100268
111        100316
797        100847
265      10106872
883      10177257
446      10203437
827       1021768
516        102351
1231    102665981
940        102706
54       10278225
280      10283250
1146     10288932
145     103261464
143      10326356
488         10340
544         10367
1120     10408176
650         10422
76      104352905
129      10441000
1049     10470143
991        105005
816      10515659
          ...    
661           NaN
667           NaN
669           NaN
683           NaN
684           NaN
690           NaN
697           NaN
706           NaN
739           NaN
750           NaN
772           NaN
802           NaN
805           NaN
826           NaN
877           NaN
908           NaN
913           NaN
918           NaN
924           NaN
926           NaN
958           NaN
965           NaN
981           NaN
986           NaN
990       

In [26]:
ary = np.arange(0,100,0.01)

In [31]:
%%timeit
ary2 = ary*2 - 0.01

The slowest run took 24.38 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 11.4 µs per loop


In [30]:
ary3 =

array([ -1.00000000e-02,   1.00000000e-02,   3.00000000e-02, ...,
         1.99930000e+02,   1.99950000e+02,   1.99970000e+02])

In [67]:
import urllib.parse

In [69]:
urllib.parse.quote('This is the film (2009)')

'This%20is%20the%20film%20%282009%29'

In [106]:
l = [1,2,3,4]

In [107]:
l.split()

AttributeError: 'list' object has no attribute 'split'