In [14]:
import numpy as np
import pandas as pd
from math import trunc
import pickle
import copy
import random
import xgboost as xgb
from sklearn import metrics
from sklearn import model_selection
import os

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Helper Functions

In [38]:
#load and save the feature/label dataframes and save into a dictionary
def loadFeaturesLabels():
    
    #read all the feature-label csvs
    #read in files and sort into dictionary
    #each element of dict is a movie that contains a list of featurelabel screenings
    files = np.sort(os.listdir("disk/Features & Label Csvs/"))
    prevMovieName = files[1][:files[1].find("_")]
    screeningDict = dict()
    screenings = list()

    for file in files:

        if file == '.ipynb_checkpoints':
            continue 

        movieName = file[:file.find("_")]
        if movieName != prevMovieName:
            screeningDict[prevMovieName] = screenings
            screenings = list()
            prevMovieName = movieName
        else:
            infoDf = pd.read_csv("disk/Features & Label Csvs/" + file)
            infoDf.drop("Unnamed: 0", axis=1, inplace=True)
            screenings.append(infoDf)
            prevMovieName = movieName
    
    screeningDict[movieName] = screenings
        
    return screeningDict

#singular features 
def assembleSingularFeatureLabelDf(screeningDict, movieList):
    #assemble entire feature label dataframe 
    infoDf = pd.DataFrame([])
    for movie in movieList:
        #access movie 
        screeningList = screeningDict[movie]
        for screening in screeningList:
            infoDf = pd.concat([infoDf,screening], axis=0, ignore_index=True)
            
    return infoDf

def assembleWindowFeaturesDf(screeningDict, movieList):
    

def vocWindow(screeningList):
    
    modifiedScreenings = list()
    #assemble the windowed dataframes
    for screeningIndex in range(0, len(screeningList)):

        screening = screeningList[screeningIndex]

        vocScreening = np.array([]) #only contains the voc values - features
        for screeningInstance in [screening.loc[index-9:index, '0'] for index in range(9,screening.shape[0])]:
            if vocScreening.shape[0] == 0:
                vocScreening = screeningInstance.values

            else:
                vocScreening = np.vstack((vocScreening,screeningInstance.values))


        #create dataframe
        vocScreening = pd.DataFrame(vocScreening)
        #drop last row
        vocScreening.drop(vocScreening.shape[0]-1, axis=0, inplace=True)
        #extract labels
        vocLabels = screening.loc[10:,'0']
        vocLabels.index = range(0, vocLabels.shape[0]) #reindex so concatenates correctly 

        #concat labels and features and create header 
        header = list(vocScreening.columns)
        header.append('VOC')
        vocScreening = pd.concat([vocScreening, vocLabels], axis=1, copy=False, ignore_index=True)
        vocScreening.columns = header

        modifiedScreenings.append(vocScreening)
        
    return modifiedScreenings

def assembleARFeatureLabelDf(screeningDict, movieList):
    #assemble entire feature label dataframe 
    infoDf = pd.DataFrame([])
    for movie in movieList:
        #access movie 
        screeningList = screeningDict[movie]
        screeningList = vocWindow(screeningList)
        for screening in screeningList:
            infoDf = pd.concat([infoDf,screening], axis=0, ignore_index=True)
            
    return infoDf


### Main Code

In [3]:
#load the screenings
screeningDict = loadFeaturesLabels()

In [36]:
R2_score = list()
RMSE_score = list()
testingMovies = list()
Random_R2_score = list()
Random_RMSE_score = list()

movieRuntimesPath = 'Numerical Data/movie_runtimes.csv'
movieRuntimeDf = pd.read_csv(movieRuntimesPath, usecols = ['movie', 'runtime (mins)', 'effective runtime'])
movieList = list(movieRuntimeDf['movie'])

for movie in movieList: 
    
    #train test split 
    trainingMovies = list(movieRuntimeDf['movie'])
    testMovie = [movie]
    trainingMovies.pop(trainingMovies.index(testMovie[0]))
    
    #assemble the training and test feature label dataframes
    trainingDf = assembleWindowFeaturesDf(screeningDict,trainingMovies)
    break
#     testingDf = assembleSingularFeatureLabelDf(screeningDict,testMovie)

#     #split into labels and features 
#     trainingLabels = trainingDf["0"].values
#     trainingFeatures = trainingDf.drop("0", axis=1).values
#     testingLabels = testingDf["0"].values
#     testingFeatures = testingDf.drop("0", axis=1).values
    
#     #normal 
#     regressor = xgb.XGBRegressor(n_estimators=1000, n_jobs=-1)
#     regressor.fit(trainingFeatures, trainingLabels.ravel())

#     predictions = regressor.predict(testingFeatures)
#     r2_score = metrics.r2_score(testingLabels, predictions)
#     rmse = np.sqrt(metrics.mean_squared_error(testingLabels,predictions))
    
#     print("R2 score:", r2_score)
#     print("RMSE: ", rmse)
    
#     R2_score.append(r2_score)
#     RMSE_score.append(rmse)
#     testingMovies.append(movie)

    

In [37]:
trainingDf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,VOC
0,0.001132,0.000000,0.005582,0.013024,0.013024,0.018528,0.025004,0.026246,0.031160,0.041892,0.052950
1,0.000000,0.005582,0.013024,0.013024,0.018528,0.025004,0.026246,0.031160,0.041892,0.052950,0.071716
2,0.005582,0.013024,0.013024,0.018528,0.025004,0.026246,0.031160,0.041892,0.052950,0.071716,0.090050
3,0.013024,0.013024,0.018528,0.025004,0.026246,0.031160,0.041892,0.052950,0.071716,0.090050,0.099900
4,0.013024,0.018528,0.025004,0.026246,0.031160,0.041892,0.052950,0.071716,0.090050,0.099900,0.117047
...,...,...,...,...,...,...,...,...,...,...,...
78620,0.960048,0.955568,0.951804,0.950886,0.951736,0.948350,0.945327,0.944491,0.943236,0.944005,0.950293
78621,0.955568,0.951804,0.950886,0.951736,0.948350,0.945327,0.944491,0.943236,0.944005,0.950293,0.956540
78622,0.951804,0.950886,0.951736,0.948350,0.945327,0.944491,0.943236,0.944005,0.950293,0.956540,0.955798
78623,0.950886,0.951736,0.948350,0.945327,0.944491,0.943236,0.944005,0.950293,0.956540,0.955798,0.948876


In [10]:
#write to dataframe
pd.DataFrame({'RMSE':RMSE_score, 'R2 Score':R2_score, 'Test Movie': testingMovies}).to_csv("XGBoost No Audio Results.csv")