## Run functions in the supplement before running this function

In [None]:
###### Function to generate train and test data for classification models #######
### ------ Input ---------- ###
# feature_data: a data frame with original features from IMDB and TMDB
# genre_data: a data frame with genres (already merged and converted to one-hot coding), 
#             matching the feature_data by 'imdb_ids'
# n_sample: the number of data points to sample from the input data frame to fit model
# train_ratio: the percentage of training data among sampled data
# random_state: seed for random sampling


def generate_train_test_data(feature_data, genre_data, n_sample = 5000, train_ratio = 0.5, random_state = 0):
    """Sample rows, split them into train/test sets and engineer model features.

    Parameters
    ----------
    feature_data : pandas.DataFrame
        Original features from IMDB and TMDB. The columns read here are
        'year', 'rating', 'votes', 'popularity_TMDB', 'runtime_TMDB',
        'title', 'plot', 'plot outline', 'overview_TMDB', 'tagline_TMDB'
        and 'mpaa_reason'.
    genre_data : pandas.DataFrame
        Genre labels. Rows are picked by the same *positions* as in
        ``feature_data``, so both frames are assumed to be in the same row
        order -- TODO confirm against the caller.
    n_sample : int
        Number of rows to sample (without replacement) from the input.
    train_ratio : float
        Fraction of the sampled rows used as training data; the first
        ``int(n_sample * train_ratio)`` sampled rows form the train set.
    random_state : int
        Seed passed to ``random.seed`` before sampling.

    Returns
    -------
    tuple
        ``(x_train_new, x_test_new, y_train, y_test)`` where the ``x_*``
        frames hold the engineered features and the ``y_*`` frames are the
        matching rows of ``genre_data``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``n_sample`` exceeds the number of
        rows in ``feature_data``.
    """
    import pandas as pd
    import random
    import warnings
    # NOTE: this silences every warning for the whole process, not only here.
    warnings.filterwarnings("ignore")

    # ------------------------------------------------------------------------------------------------------ #

    ### sample data ###
    random.seed(random_state)
    index = random.sample(range(0, feature_data.shape[0]), n_sample)

    data_x = feature_data.iloc[index]
    data_y = genre_data.iloc[index]

    ### split into train and test data ###
    n_train = int(n_sample * train_ratio)
    # copy so that the imputation below never writes into the caller's frame
    x_train = data_x.iloc[:n_train].copy()
    x_test = data_x.iloc[n_train:].copy()
    y_train = data_y.iloc[:n_train]
    y_test = data_y.iloc[n_train:]

    # ------------------------------------------------------------------------------------------------------ #
    ### Variable Group 1: ['year', 'rating', 'votes', 'popularity_TMDB', 'runtime_TMDB'] ###

    val_group1 = ['year', 'rating', 'votes', 'popularity_TMDB', 'runtime_TMDB']

    ## missing value imputation ##
    # 'popularity_TMDB' and 'runtime_TMDB' code missing values as 0: turn them
    # into NaN first so that every column in the group is imputed the same way
    # and the placeholder zeros do not drag the mean down
    for col in ['popularity_TMDB', 'runtime_TMDB']:
        x_train[col] = x_train[col].mask(x_train[col] == 0)
        x_test[col] = x_test[col].mask(x_test[col] == 0)

    # the mean always comes from the train data, also when filling the test data
    for col in val_group1:
        train_mean = x_train[col].mean()
        x_train[col] = x_train[col].fillna(train_mean)
        x_test[col] = x_test[col].fillna(train_mean)

    # positional index so that the columns line up with the PCA outputs below
    x_train_group1 = x_train[val_group1].reset_index(drop=True)
    x_test_group1 = x_test[val_group1].reset_index(drop=True)

    # ------------------------------------------------------------------------------------------------------ #
    ### Variables Group 2: ['title', 'plot', 'plot outline', 'overview_TMDB', 'tagline_TMDB'] ###

    ## combine the text columns; the separator keeps the last word of one
    ## field from being fused with the first word of the next
    other_text = ['plot', 'plot outline', 'overview_TMDB', 'tagline_TMDB']

    x_train_text = x_train['title'].str.cat([x_train[col] for col in other_text],
                                            sep = " ", na_rep = " ")

    x_test_text = x_test['title'].str.cat([x_test[col] for col in other_text],
                                          sep = " ", na_rep = " ")

    ## apply text analysis on combined text and return the top 10 PCs
    x_train_group2, x_test_group2 = text_analysis(x_train_text, x_test_text,
                                                  val_name = 'text', n_components = 10)

    # ------------------------------------------------------------------------------------------------------ #
    ### Variables Group 3: ['mpaa_reason'] ###

    ## apply text analysis on 'mpaa_reason' and return the top 10 PCs
    x_train_group3, x_test_group3 = text_analysis(x_train['mpaa_reason'], x_test['mpaa_reason'],
                                                  val_name = 'mpaa', n_components = 10)

    # ------------------------------------------------------------------------------------------------------ #
    ### Variables Group 4: ['director', 'cast', 'production company', 'writer'] ###
    ### Variables Group 5: ['animation department', 'original music'] ###

    # TODO: groups 4 and 5 are not engineered yet; add their frames to the
    # concatenation below once they exist

    # ------------------------------------------------------------------------------------------------------ #
    ### Combine Engineered Features ###
    x_train_new = pd.concat([x_train_group1, x_train_group2, x_train_group3], axis=1)
    x_test_new = pd.concat([x_test_group1, x_test_group2, x_test_group3], axis=1)

    # restore the sampled row labels of the feature frame
    x_train_new.index = x_train.index
    x_test_new.index = x_test.index

    return (x_train_new, x_test_new, y_train, y_test)

## Supplement: functions that are used in the above meta-function:

In [21]:
##### part of the function to apply text analysis to a data series #####
#### to transform a text paragraph to bag-of-words, then to a word-count matrix


#----------------------------------------------------------------------------------------------------
### Input ###
        # data: a series for text analysis

        
### Output ###
        # the transformed data as a word-count matrix (one column per word)
#----------------------------------------------------------------------------------------------------        



def text_to_matrix(data):
    """Turn a series of text paragraphs into a bag-of-words count matrix.

    Each string entry is stripped of non-letter characters, lower-cased,
    split into words and cleared of English stop words. Entries that are not
    strings (for example missing values) are replaced by the placeholder
    text "NA" before vectorizing.

    Parameters
    ----------
    data : pandas.Series
        The text to analyse, one paragraph per entry.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``data`` and one column per word kept by the
        ``CountVectorizer`` (at most 50000), holding the word counts.
    """
    import re
    import warnings

    import pandas as pd
    from sklearn.feature_extraction.text import CountVectorizer

    # NOTE: this silences every warning for the whole process, not only here.
    warnings.filterwarnings("ignore")

    # avoid downloading nltk
    # from NLTK stopwords https://pythonprogramming.net/stop-words-nltk-tutorial/
    stops = {'ourselves', 'hers', 'between', 'yourself', 'but', 'again', 'there', 'about', 'once', 
             'during', 'out', 'very', 'having', 'with', 'they', 'own', 'an', 'be', 'some', 'for', 
             'do', 'its', 'yours', 'such', 'into', 'of', 'most', 'itself', 'other', 'off', 'is', 
             's', 'am', 'or', 'who', 'as', 'from', 'him', 'each', 'the', 'themselves', 'until', 
             'below', 'are', 'we', 'these', 'your', 'his', 'through', 'don', 'nor', 'me', 'were', 
             'her', 'more', 'himself', 'this', 'down', 'should', 'our', 'their', 'while', 'above', 
             'both', 'up', 'to', 'ours', 'had', 'she', 'all', 'no', 'when', 'at', 'any', 'before', 
             'them', 'same', 'and', 'been', 'have', 'in', 'will', 'on', 'does', 'yourselves', 'then', 
             'that', 'because', 'what', 'over', 'why', 'so', 'can', 'did', 'not', 'now', 'under', 'he', 
             'you', 'herself', 'has', 'just', 'where', 'too', 'only', 'myself', 'which', 'those', 'i', 
             'after', 'few', 'whom', 't', 'being', 'if', 'theirs', 'my', 'against', 'a', 'by', 'doing', 
             'it', 'how', 'further', 'was', 'here', 'than'}
    non_letters = re.compile("[^a-zA-Z]")

    ## --------------- Bag-of-Words --------------- ##

    ## string to list of cleaned paragraphs
    col_words = []
    for text in data.values:
        if isinstance(text, str):
            tokens = non_letters.sub(" ", text).lower().split()
            col_words.append(" ".join(w for w in tokens if w not in stops))
        else:
            # non-string entry (e.g. NaN): keep a placeholder so the row survives
            col_words.append("NA")

    ## list to vector
    vectorizer = CountVectorizer(analyzer = "word",
                                 tokenizer = None,
                                 preprocessor = None,
                                 stop_words = None,
                                 max_features = 50000)

    counts = vectorizer.fit_transform(col_words)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    return pd.DataFrame(counts.toarray(), columns = feature_names)
    

In [22]:
##### Function to apply text analysis (bag-of-words followed by PCA) to a train and a test text series


#----------------------------------------------------------------------------------------------------
### Input ###
        # train: a series of raw text for the train data
        # test: a series of raw text for the test data
        # val_name:  variable name that's used in naming the columns as "val_name_PCi"
        # n_components: if value is int, the number of PCs to return
                       # if value between (0,1), the variance explained by the PCs returned
        
### Output ###
        # data matrix of engineered feature, one for train data and one for test data
#----------------------------------------------------------------------------------------------------        


def text_analysis(train, test, val_name, n_components):
    """Reduce train/test text to principal components of their word counts.

    Both inputs are converted with ``text_to_matrix``, aligned on the union
    of their word columns, and projected with a PCA that is fitted on the
    train rows only.

    Parameters
    ----------
    train, test : pandas.Series
        Raw text for the train and the test data.
    val_name : str
        Prefix of the output column names, which are "<val_name>_PC1",
        "<val_name>_PC2", ...
    n_components : int or float
        Passed to ``sklearn.decomposition.PCA``: an int is the number of
        components to keep, a float in (0, 1) the share of variance the kept
        components must explain.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_new, test_new)`` with one row per input entry and one column
        per principal component.
    """
    import pandas as pd
    from sklearn.decomposition import PCA

    ## turn each text paragraph into a word-count row
    train_counts = text_to_matrix(train)
    test_counts = text_to_matrix(test)
    n_train = train_counts.shape[0]

    ## take the union set of words in train and test data as columns
    ## words that don't show up are assigned 0
    aligned = pd.concat([train_counts, test_counts], axis=0).fillna(0)

    ## split back into train and test after aligning the columns
    train_matrix = aligned.iloc[:n_train]
    test_matrix = aligned.iloc[n_train:]

    ## PCA, fitted on the train rows only
    pca = PCA(n_components = n_components, svd_solver = "full")
    pca.fit(train_matrix)
    train_new = pd.DataFrame(pca.transform(train_matrix))
    test_new = pd.DataFrame(pca.transform(test_matrix))

    # the number of components is read from the result because a fractional
    # n_components only fixes it after fitting
    col_names = [val_name + "_PC" + str(i + 1) for i in range(train_new.shape[1])]
    train_new.columns = col_names
    test_new.columns = col_names

    return (train_new, test_new)
    