In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import statsmodels
from collections import Counter
import cpi
import ast
import json
import datetime
from sklearn.decomposition import PCA
from collections import Counter
from wordcloud import WordCloud, STOPWORDS 

In [2]:
def text_to_dict(df):
    """Parse the stringified literal columns of ``df`` into Python objects.

    Each of the eight columns listed below is rewritten in place: missing
    cells become an empty dict, every other cell is passed through
    ``ast.literal_eval``.

    Args:
        df: DataFrame that contains all of the listed columns.

    Returns:
        The same DataFrame object, after modification.
    """
    def _parse(cell):
        # Missing values are represented by an empty dict.
        if pd.isna(cell):
            return {}
        return ast.literal_eval(cell)

    literal_columns = (
        'belongs_to_collection',
        'genres',
        'production_companies',
        'production_countries',
        'spoken_languages',
        'Keywords',
        'cast',
        'crew',
    )
    for name in literal_columns:
        df[name] = df[name].apply(_parse)
    return df

def classify_movie_budget(budget):
    """Map a numeric budget to a coarse budget tier label.

    Args:
        budget: Budget amount as a number; may be ``None`` or NaN.

    Returns:
        ``'no data'`` for a missing, zero or negative budget, otherwise one of
        ``'micro-budget'`` (< 400000), ``'low-budget'`` (< 2000000),
        ``'middle-budget'`` (< 10000000) or ``'high-budget'``.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing budget would fall through to 'high-budget'; None would raise.
    if budget is None or pd.isna(budget) or budget <= 0:
        return 'no data'

    # Upper bounds are exclusive and checked in ascending order.
    tiers = (
        (400000, 'micro-budget'),
        (2000000, 'low-budget'),
        (10000000, 'middle-budget'),
    )
    for upper_bound, label in tiers:
        if budget < upper_bound:
            return label
    return 'high-budget'

In [3]:
# Load the training CSV from the local tmdb-box-office-prediction folder
# (path is relative to the notebook's working directory).
dataset_original = pd.read_csv('./tmdb-box-office-prediction/train.csv')
# Print the column list with non-null counts and dtypes.
dataset_original.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 23 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     3000 non-null   int64  
 1   belongs_to_collection  604 non-null    object 
 2   budget                 3000 non-null   int64  
 3   genres                 2993 non-null   object 
 4   homepage               946 non-null    object 
 5   imdb_id                3000 non-null   object 
 6   original_language      3000 non-null   object 
 7   original_title         3000 non-null   object 
 8   overview               2992 non-null   object 
 9   popularity             3000 non-null   float64
 10  poster_path            2999 non-null   object 
 11  production_companies   2844 non-null   object 
 12  production_countries   2945 non-null   object 
 13  release_date           3000 non-null   object 
 14  runtime                2998 non-null   float64
 15  spok

In [4]:
# Work on a copy so the frame loaded from disk stays untouched.
dataset = dataset_original.copy()
# Reduce each stringified cast entry to a list of the members' names.
# A missing cast becomes an empty list (not a dict) so the column holds a
# single container type; an empty list still explodes to one NaN row.
dataset['cast'] = dataset['cast'].apply(
    lambda raw: [] if pd.isna(raw) else [member.get('name') for member in ast.literal_eval(raw)]
)


def fix_date(date, latest_year=2020):
    """Parse an ``m/d/yy`` date string and repair the two-digit-year century.

    ``%y`` parsing can place old years in the future; any parsed year later
    than ``latest_year`` is moved back by one century.

    Args:
        date: Date string in ``%m/%d/%y`` format, or a missing value.
        latest_year: Last year accepted as-is; later years have 100
            subtracted. Defaults to 2020.

    Returns:
        A ``datetime.datetime`` at midnight of the corrected date, or
        ``pd.NaT`` when ``date`` is missing.
    """
    # A missing date would otherwise reach the datetime constructor with NaN
    # components and raise.
    if pd.isna(date):
        return pd.NaT

    parsed = pd.to_datetime(date, format="%m/%d/%y")
    year = parsed.year - 100 if parsed.year > latest_year else parsed.year
    return datetime.datetime(year, parsed.month, parsed.day)

# Replace the raw release_date strings with century-corrected datetimes.
# The source column is read from dataset_original, not from dataset.
dataset["release_date"] = dataset_original["release_date"].apply(fix_date)

def adjust_price_to_inflation(price, date):
    """Inflation-adjust ``price`` from the year of ``date``.

    Args:
        price: Amount expressed in the money of ``date.year``.
        date: Object exposing a ``year`` attribute.

    Returns:
        The result of ``cpi.inflate(price, date.year)`` truncated to ``int``.
    """
    source_year = date.year
    inflated = cpi.inflate(price, source_year)
    return int(inflated)

# Row-wise: convert budget and revenue through adjust_price_to_inflation,
# using the year of each row's release_date as the source year.
dataset["budget"] = dataset.apply(lambda x: adjust_price_to_inflation(x["budget"], x["release_date"]), axis=1)
dataset["revenue"] = dataset.apply(lambda x: adjust_price_to_inflation(x["revenue"], x["release_date"]), axis=1)

In [5]:
def map_list_to_integer(lists):
    """Assign consecutive integer ids to the distinct items of nested lists.

    Args:
        lists: Iterable of iterables of hashable items.

    Returns:
        Dict mapping each distinct item to an id, numbered from 0 in order of
        first appearance.
    """
    ids = {}
    for group in lists:
        for item in group:
            # len(ids) is evaluated before insertion, so a new item receives
            # the next free id; known items keep theirs.
            ids.setdefault(item, len(ids))
    return ids

In [6]:
# cast_mapping = map_list_to_integer(dataset["cast"])
# crew_mapping = map_list_to_integer(dataset["crew"])
# dataset["cast"] = dataset.apply(lambda x: np.asarray([cast_mapping[i] for i in x["cast"]]) , axis=1)

In [7]:
# dataset["cast"] = dataset.apply(lambda x: np.asarray(x["cast"]) , axis=1)

In [8]:
# cast_names = cast_mapping.keys()
# cast_names
# for name in cast_names:
#     dataset['cast_name_' + name] = dataset['cast'].apply(lambda x: 1 if name in x else 0)

In [9]:
# Columns still stored as stringified literals that need parsing.
# 'cast' is not listed: it was already turned into name lists when `dataset`
# was built from dataset_original.
json_cols = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'Keywords','crew']

def get_dictionary(s):
    """Parse a stringified Python literal, falling back to an empty dict.

    Args:
        s: String holding a Python literal (typically a list of dicts), a
            missing value, or an already-parsed list/dict.

    Returns:
        The parsed object; ``s`` itself when it is already a list or dict;
        ``{}`` when ``s`` cannot be parsed as a literal.
    """
    # Already parsed (e.g. the cell was re-run): pass through instead of
    # silently replacing real data with {}.
    if isinstance(s, (dict, list)):
        return s
    try:
        # literal_eval only accepts literals, so text from the CSV cannot
        # execute code the way eval() would.
        return ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Missing values (NaN) and malformed text end up here.
        return {}

# Parse every stringified column, including the collection column.
for col in [*json_cols, 'belongs_to_collection']:
    dataset[col] = dataset[col].apply(get_dictionary)

# Director = name of the first crew member whose job is "Director" (None when
# there is none), then one-hot encoded into director_* columns.
dataset["director"] = dataset["crew"].apply(
    lambda crew: next(
        iter([member["name"] for member in crew if member["job"] == "Director"]),
        None,
    )
)
dataset = pd.get_dummies(dataset, columns=['director'], prefix=['director'], drop_first=True)

# Round the target to the nearest 100,000.
dataset["revenue"] = dataset["revenue"].apply(lambda amount: round(amount, -5))
# dataset.cast.str.join('|').str.get_dummies(drop_first=True).add_prefix('actor_')

# NOTE: explode yields one row per (movie, cast member), so the other columns
# of a movie are repeated on each of its rows; the cast value of every row is
# then one-hot encoded into actor_* columns.
dataset = dataset.explode("cast")
dataset = pd.get_dummies(dataset, columns=['cast'], prefix=['actor'], drop_first=True)

In [10]:
# Debugging aid: dumps the saved IPython history of all sessions. The output
# includes code from unrelated sessions and absolute local paths, so clear
# this cell's output before sharing the notebook.
%history -g

 1/1:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 1/2:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 1/3:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 1/4:
train = pd.read_csv('./tmdb-box-office-prediction/train.csv')
train.info()
 1/5: train.head()
 1/6: train.describe(include='all')
 1/7: train.describe(include='all')
 1/8:
# Missing values 
train.isna().sum()
 1/9:
sns.jointplot(x="budget", y="revenue", data=train, height=11, ratio=4, color="g")
plt.show()
1/10:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
1/11:
sns.jointplot(x="budget", y="revenue", data=train, height=11, ratio=4, color="g")
plt.show()
1/12:
sns.jointplot(x="budget", y="revenue", data=train, height=11, ratio=4, color="b")
plt.show()
1/13:
sns.jointplot(x="budget", y="revenue", data=train, height=11, ratio=4, color="r")
plt.show()
1/14:
sns.jointplot(x="budget", y="revenue", data=train, he

grouped = dataset["id"].groupby(dataset["release_date"].dt.year).agg('count')
grouped.iloc[100]
# pca = PCA()
# X_pca = pca.fit_transform(array)
# X_pca.shape
#
91/211:
grouped = dataset["id"].groupby(dataset["release_date"].dt.year).agg('count')
grouped.iloc[50]
# pca = PCA()
# X_pca = pca.fit_transform(array)
# X_pca.shape
#
91/212:
grouped = dataset["id"].groupby(dataset["release_date"].dt.year).agg('count')
grouped.columns
# pca = PCA()
# X_pca = pca.fit_transform(array)
# X_pca.shape
#
91/213:
grouped = dataset["id"].groupby(dataset["release_date"].dt.year).agg('count')
grouped.columns()
# pca = PCA()
# X_pca = pca.fit_transform(array)
# X_pca.shape
#
91/214:
grouped = dataset[["release_date", "id"]].groupby(dataset["release_date"].dt.year).agg('count')
grouped.columns()
# pca = PCA()
# X_pca = pca.fit_transform(array)
# X_pca.shape
#
91/215:
grouped = dataset[["release_date", "id"]].groupby(dataset["release_date"].dt.year).agg('count')
grouped
# pca = PCA()
# X_pca = pca.fit_tran

sys.path.extend(['/Users/kamildoleglo/Downloads/im_lab3'])
202/2: runfile('/Users/kamildoleglo/Downloads/im_lab3/lab3_segmentation.py', wdir='/Users/kamildoleglo/Downloads/im_lab3')
203/1:
print('PyDev console: using IPython 7.13.0\n')

import sys; print('Python %s on %s' % (sys.version, sys.platform))
sys.path.extend(['/Users/kamildoleglo/Downloads/im_lab3'])
204/1:

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
204/2:

# Load the diabetes dataset
diabetes = datasets.load_diabetes()
204/3:

# Use only one feature
diabetes_X = diabetes.data[:, np.newaxis, 2]

# Split the data into training/testing sets
diabetes_X_train = diabetes_X[:-20]
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]
204/4:

# Create linear regression object
regr = linear_model.LinearRegression()

# T

cs_clf = tree.DecisionTreeClassifier()
cs_clf = cs_clf.fit(CS_X, CS_y)
258/12:
features = ["computers", "sites"]

plt.figure(figsize=(10, 10))
tree.plot_tree(cs_clf, filled=True, feature_names=features, class_names=errors)
plt.show()
258/13:
#features_onehot = ["C1", "C2", "C3", "C4", "WWW1", "WWW2", "WWW3", "WWW4"]

features_onehot = computers + sites

CS_X_onehot=[]

for (comp_id, server_id) in CS_X:
    vector = [0 for _ in range(len(features_onehot))]
    vector[comp_id] = 1
    vector[len(computers) + server_id] = 1
    CS_X_onehot.append(vector)
258/14: print(CS_X_onehot)
258/15:
cs_clf_onehot = tree.DecisionTreeClassifier(max_depth=2, min_samples_leaf=5)
cs_clf_onehot = cs_clf_onehot.fit(CS_X_onehot, CS_y)
258/16:
features_onehot = computers + sites

plt.figure(figsize=(10, 10))
tree.plot_tree(cs_clf_onehot, filled=True, feature_names=features_onehot, class_names=errors)
plt.show()
259/1:
from sklearn.datasets import load_iris
from sklearn import tree
import numpy as np

%matplo

274/25:
plt.figure(figsize=(20,12))
sns.countplot(dataset_oscars['release_date'].dt.year.sort_values())
plt.title("13. Oscar movie releases by year",fontsize=20)
plt.xticks(fontsize=12,rotation=90)
plt.xlabel("year")
plt.show()
274/26:
plt.figure(figsize=(20,12))
sns.countplot(dataset_oscars['release_date'].dt.weekday.sort_values())
plt.title("14. Oscar movie releases by weekday",fontsize=20)
loc, _ = plt.xticks()
loc, labels = loc, ["Mon","Tue","Wed","Thu","Fri","Sat","Sun"]
plt.xticks(loc, labels,fontsize=20)
plt.xlabel("weekday")
plt.show()
274/27:
plt.figure(figsize=(20,12))
sns.countplot(dataset_oscars['release_date'].dt.month.sort_values())
plt.title("15. Oscar movie releases by month",fontsize=20)
loc, _ = plt.xticks()
loc, labels = loc, ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
plt.xticks(loc, labels,fontsize=20)
plt.xlabel("month")
plt.show()
274/28:
plt.figure(figsize=(20,12))
dataset_oscars.groupby(dataset_oscars["release_date"].dt.

301/24:
plt.figure(figsize=(20,12))
sns.countplot(dataset_oscars['release_date'].dt.month.sort_values())
plt.title("15. Oscar movie releases by month",fontsize=20)
loc, _ = plt.xticks()
loc, labels = loc, ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
plt.xticks(loc, labels,fontsize=20)
plt.xlabel("month")
plt.show()
301/25:
plt.figure(figsize=(20,12))
dataset_oscars.groupby(dataset_oscars["release_date"].dt.month).agg('mean')['revenue'].plot(kind='bar',rot=0)
plt.ylabel('Mean revenue (100 million dollars)')
plt.title("16. Mean revenue by month for oscar movies")
loc, _ = plt.xticks()
loc, labels = loc, ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
plt.xticks(loc, labels,fontsize=20)
plt.xlabel("month")
plt.show()
301/26:
plt.figure(figsize=(20,12))
genre=dataset_oscars['genres'].apply(lambda x: [i['name'] for i in x] if x != {} else [])
count=Counter([i for j in genre for i in j]).most_common(20)
sns.barplot(

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import GroupKFold, KFold, train_test_split

# Feature matrix: the budget plus every director_* / actor_* indicator column.
X_train = dataset.filter(regex=("director_.*|budget|actor_.*")).to_numpy()  # actors, director, budget
y_train = dataset[["revenue"]].to_numpy()
# `dataset` holds one row per (movie, cast member) after the explode on
# "cast". Rows of one movie share budget, director and revenue, so they must
# stay in the same fold; otherwise the target leaks from train into test.
movie_ids = dataset["id"].to_numpy()

# Pick the tree depth with the best mean 10-fold cross-validated score
# (DecisionTreeRegressor.score, i.e. R^2).
depth_range = range(1, 15)
best_depth = 1
# R^2 can be negative, so start below any attainable score instead of at 0.
max_score = -np.inf
for depth in depth_range:
    scores = []
    # Fixed seed keeps tie-breaking between equally good splits reproducible.
    classifier = DecisionTreeRegressor(max_depth=depth, random_state=0)
    cv = GroupKFold(n_splits=10)
    for train_index, test_index in cv.split(X_train, y_train, groups=movie_ids):
        xtrain, xtest = X_train[train_index], X_train[test_index]
        ytrain, ytest = y_train[train_index], y_train[test_index]
        classifier.fit(xtrain, ytrain)
        scores.append(classifier.score(xtest, ytest))
    mean_score = np.mean(scores)
    print("Score: ", mean_score, "depth: ", depth)
    if mean_score > max_score:
        max_score = mean_score
        best_depth = depth
print("Max score: ", max_score, " for depth: ", best_depth)