In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import random
from plotly import tools
import plotly_express as px
from plotly.offline import init_notebook_mode,iplot,plot
import plotly.figure_factory as ff
import plotly.graph_objs as go
import ast

In [3]:
# Read the training split and the test split from CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
# (rows, columns) of each split, one tuple per line.
print(df_train.shape, df_test.shape, sep="\n")

(3000, 23)
(4398, 22)


In [5]:
# Show the first record vertically so every column of the wide frame is visible.
df_train.iloc[:1].T

Unnamed: 0,0
id,1
belongs_to_collection,"[{'id': 313576, 'name': 'Hot Tub Time Machine ..."
budget,14000000
genres,"[{'id': 35, 'name': 'Comedy'}]"
homepage,
imdb_id,tt2637294
original_language,en
original_title,Hot Tub Time Machine 2
overview,"When Lou, who has become the ""father of the In..."
popularity,6.575393


In [6]:
# Raw (still stringified) Keywords value of the first row.
print(df_train["Keywords"].iloc[:1].values)

["[{'id': 4379, 'name': 'time travel'}, {'id': 9663, 'name': 'sequel'}, {'id': 11830, 'name': 'hot tub'}, {'id': 179431, 'name': 'duringcreditsstinger'}]"]


In [7]:
def expand_json(json_str):
    """Return the ``name`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    json_str : str, list, tuple, dict, None or float
        Normally the Python-literal text of a list of dicts, e.g.
        ``"[{'id': 35, 'name': 'Comedy'}]"``. An already-parsed list/tuple
        (of dicts or of plain names) or a single dict is accepted as well,
        so applying the function twice to the same column is harmless.

    Returns
    -------
    list
        The ``name`` values in their original order. Missing input
        (``None``, ``NaN``, blank string) yields an empty list.

    Raises
    ------
    ValueError, SyntaxError
        If a non-blank string is not a valid Python literal.
    KeyError
        If a dict entry has no ``"name"`` key.
    """
    if isinstance(json_str, str):
        if not json_str.strip():
            return []
        # literal_eval only evaluates literals, so it is safe on file content.
        parsed = ast.literal_eval(json_str)
    elif isinstance(json_str, (list, tuple, dict)):
        parsed = json_str
    else:
        # None / NaN and any other non-container value carry no entries.
        return []

    if isinstance(parsed, dict):
        # A single object is treated as a one-element list.
        parsed = [parsed]

    # Entries that are already plain names (second pass) are kept unchanged.
    return [item["name"] if isinstance(item, dict) else item for item in parsed]
# Columns whose cells hold a stringified list of dicts.
json_features = [
    "belongs_to_collection",
    "genres",
    "production_companies",
    "production_countries",
    "Keywords",
    "spoken_languages",
    "cast",
    "crew",
]
# Replace every non-missing cell by its list of names; missing cells stay missing.
for frame in (df_train, df_test):
    for feature in json_features:
        has_value = frame[feature].notnull()
        frame.loc[has_value, feature] = frame.loc[has_value, feature].apply(expand_json)

In [8]:
# Keywords value of the first row after parsing.
print(df_train["Keywords"].iloc[:1].values)

[list(['time travel', 'sequel', 'hot tub', 'duringcreditsstinger'])]


In [9]:
# Number of missing values per training column.
df_train.isna().sum()

id                          0
belongs_to_collection    2396
budget                      0
genres                      7
homepage                 2054
imdb_id                     0
original_language           0
original_title              0
overview                    8
popularity                  0
poster_path                 1
production_companies      156
production_countries       55
release_date                0
runtime                     2
spoken_languages           20
status                      0
tagline                   597
title                       0
Keywords                  276
cast                       13
crew                       16
revenue                     0
dtype: int64

In [11]:
# Presence indicators: 1 when the source column holds a value, 0 when it is missing.
for frame in (df_train, df_test):
    for flag_name, source_col in (
        ("has_homepage", "homepage"),
        ("is_belongs_to_collection", "belongs_to_collection"),
        ("has_tagline", "tagline"),
    ):
        frame[flag_name] = frame[source_col].notna().astype("int64")

In [12]:
# Compare two of the new indicator columns with the columns they were derived from.
df_train.loc[
    :, ["has_homepage", "is_belongs_to_collection", "homepage", "belongs_to_collection"]
].head()

Unnamed: 0,has_homepage,is_belongs_to_collection,homepage,belongs_to_collection
0,1,1,0,[Hot Tub Time Machine Collection]
1,1,1,0,[The Princess Diaries Collection]
2,1,0,1,
3,1,0,1,
4,1,0,0,


In [13]:
# NOTE(review): this cell repeats the preview shown by the cell directly above it.
df_train.loc[
    :, ["has_homepage", "is_belongs_to_collection", "homepage", "belongs_to_collection"]
].head()

Unnamed: 0,has_homepage,is_belongs_to_collection,homepage,belongs_to_collection
0,1,1,0,[Hot Tub Time Machine Collection]
1,1,1,0,[The Princess Diaries Collection]
2,1,0,1,
3,1,0,1,
4,1,0,0,


In [14]:
# These three columns are now represented by their 0/1 presence indicators.
df_train, df_test = (
    frame.drop(columns=["belongs_to_collection", "tagline", "homepage"])
    for frame in (df_train, df_test)
)

In [15]:
# "belongs_to_collection" is no longer a list column to process. Rebuilding the
# list (instead of list.remove) keeps this cell safe to run more than once:
# remove() raises ValueError when the entry is already gone.
json_features = [name for name in json_features if name != "belongs_to_collection"]
# Missing cells become empty lists so every remaining cell supports len().
for item in json_features:
    df_train[item] = df_train[item].apply(lambda x: x if isinstance(x, list) else [])
    df_test[item] = df_test[item].apply(lambda x: x if isinstance(x, list) else [])

In [16]:
# A missing overview becomes an empty string; any row that still has a missing
# value in any column is then removed.
# NOTE(review): rows are removed from the test split as well, so test records with
# a remaining missing value will not receive a prediction; confirm this is intended.
df_train = df_train.assign(overview=df_train["overview"].fillna("")).dropna()
df_test = df_test.assign(overview=df_test["overview"].fillna("")).dropna()

In [17]:
df_train.isna().sum()

id                          0
budget                      0
genres                      0
imdb_id                     0
original_language           0
original_title              0
overview                    0
popularity                  0
poster_path                 0
production_companies        0
production_countries        0
release_date                0
runtime                     0
spoken_languages            0
status                      0
title                       0
Keywords                    0
cast                        0
crew                        0
revenue                     0
has_homepage                0
is_belongs_to_collection    0
has_tagline                 0
dtype: int64

In [18]:
# Revenue against budget.
px.scatter(data_frame=df_train, x="budget", y="revenue")


In [19]:
# Revenue against runtime.
px.scatter(data_frame=df_train, x="runtime", y="revenue")

In [20]:
# Revenue against popularity.
px.scatter(data_frame=df_train, x="popularity", y="revenue")

In [21]:
# Number of genres attached to each movie.
df_train["genres_list_length"] = df_train["genres"].map(len)

In [22]:
# Revenue against the number of genres.
px.scatter(data_frame=df_train, x="genres_list_length", y="revenue")

In [35]:
# Release year parsed from the release_date column.
# NOTE(review): if release_date stores two-digit years, pd.to_datetime may place
# older films in the wrong century; verify against the raw data.
df_train["year"] = pd.to_datetime(df_train["release_date"]).dt.year

# Keep releases up to and including 2015. This rebinds df_train, so every later
# cell (including model training) only sees the filtered rows.
# .copy() makes the filtered frame independent, so the columns added to it in
# later cells do not trigger pandas' SettingWithCopyWarning.
df_train = df_train[df_train["year"] <= 2015].copy()

# Total revenue per release year.
df_year_group = df_train[["revenue", "year"]].groupby("year").sum()
px.line(df_year_group)

In [36]:
# One "<column>_count" feature per list column: the number of entries in the list.
for frame in (df_train, df_test):
    for list_col in json_features:
        frame[list_col + "_count"] = frame[list_col].map(len)
df_train.columns

Index(['id', 'budget', 'genres', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'runtime', 'spoken_languages', 'status', 'title', 'Keywords', 'cast',
       'crew', 'revenue', 'has_homepage', 'is_belongs_to_collection',
       'has_tagline', 'genres_list_length', 'year', 'genres_count',
       'production_companies_count', 'production_countries_count',
       'Keywords_count', 'spoken_languages_count', 'cast_count', 'crew_count'],
      dtype='object')

In [37]:
# One "<column>_len" feature per text column: the number of characters in the text.
string_features = ["original_title", "overview", "title"]
for frame in (df_train, df_test):
    for text_col in string_features:
        frame[text_col + "_len"] = frame[text_col].map(len)
df_train.columns

Index(['id', 'budget', 'genres', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'runtime', 'spoken_languages', 'status', 'title', 'Keywords', 'cast',
       'crew', 'revenue', 'has_homepage', 'is_belongs_to_collection',
       'has_tagline', 'genres_list_length', 'year', 'genres_count',
       'production_companies_count', 'production_countries_count',
       'Keywords_count', 'spoken_languages_count', 'cast_count', 'crew_count',
       'original_title_len', 'overview_len', 'title_len'],
      dtype='object')

In [39]:
# Split release_date into calendar components for both splits.
# NOTE(review): if release_date stores two-digit years, pd.to_datetime may place
# older films in the wrong century; verify against the raw data.
for frame in (df_train, df_test):
    date_obj = pd.to_datetime(frame.release_date)
    for part in ("year", "month", "day", "dayofweek"):
        frame[part] = getattr(date_obj.dt, part)

In [40]:
# Numeric columns fed to the model, in the order the model will see them.
feature_list = [
    "budget",
    "popularity",
    "runtime",
    "has_homepage",
    "is_belongs_to_collection",
    "has_tagline",
    "genres_count",
    "production_companies_count",
    "production_countries_count",
    "Keywords_count",
    "spoken_languages_count",
    "cast_count",
    "crew_count",
    "original_title_len",
    "overview_len",
    "title_len",
    "year",
    "month",
    "day",
    "dayofweek",
]
# Same column selection for both splits so train and test features line up.
df_train_feature = df_train[feature_list]
df_test_feature = df_test[feature_list]

In [41]:
# Regression target: the revenue column of the training split.
df_train_target = df_train.revenue
df_train_target

0        12314651
1        95149435
2        13092000
3        16000000
4         3923970
          ...    
2995      1596687
2996       180590
2997     89456761
2998    171963386
2999     82087155
Name: revenue, Length: 2673, dtype: int64

In [42]:
from xgboost import XGBRegressor

# Gradient-boosted tree regressor with a fixed seed.
# NOTE(review): 2000 trees of depth 27 is a very high-capacity model for this
# number of rows and features; consider validating on held-out data.
xgb = XGBRegressor(
    max_depth=27,
    n_estimators=2000,
    random_state=0,
)
xgb.fit(df_train_feature, df_train_target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=27,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=2000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [45]:
from sklearn.metrics import mean_squared_error

# Predictions for the same rows the model was fitted on, so the figure printed
# below is an in-sample error rather than a held-out score.
pred_target = xgb.predict(df_train_feature)
print("模拟均方误差：", mean_squared_error(y_true=df_train_target, y_pred=pred_target))

模拟均方误差： 266.5857798448915


In [48]:
# Attach the model's revenue prediction to every remaining test row.
df_test = df_test.assign(pred_revenue=xgb.predict(df_test_feature))

In [49]:
# First five test titles with their predicted revenue.
df_test.loc[:, ["title", "pred_revenue"]].head()

Unnamed: 0,title,pred_revenue
0,Pokémon: The Rise of Darkrai,23000940.0
1,Attack of the 50 Foot Woman,652030.4
2,Addicted to Love,9920040.0
3,Incendies,16204590.0
4,Inside Deep Throat,1795178.0


对于列很多的数据，可以使用 df.head(1).transpose()（或简写为 df.head(1).T）来查看一条记录的完整内容；

这里所说的“JSON”数据，实际上是 Python 字典的字符串表示（使用单引号，并非严格的 JSON），它也可以表示字典列表；

使用 ast.literal_eval 方法可以将这类字符串安全地转换为 Python 的字典或者列表；

可以对单个列使用 df[col].isna().apply(lambda x: 0 if x else 1)（等价于 df[col].notna().astype(int)），把含有缺失值的列转换为取值为 0 或 1 的指示列，供模型使用；

对于内容很复杂的字段，可以考虑使用简单的数值属性来建立模型，比如我们今天的列表字段，我们就使用了它的元素个数作为特征来建立模型。