In [1]:
import findspark
import re
findspark.init('/home/cse587/spark-2.4.0-bin-hadoop2.7')
import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
import pandas as pd
from nltk.corpus import stopwords
import nltk 
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder,StringIndexer, VectorAssembler
import numpy as np
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD, SVMModel
from pyspark.mllib.linalg import Vector as MllibVector, Vectors as MLLibVectors
from pyspark.ml.feature import HashingTF, IDF

In [2]:
sc = pyspark.SparkContext()

In [3]:
spark = SparkSession \
    .builder \
    .appName("PA3") \
    .config("spark.some.config.option","some-value") \
    .getOrCreate()

In [4]:
#read in data
df = pd.read_csv("train.csv")
df = spark.createDataFrame(df)

In [5]:
dft = pd.read_csv("test.csv")
dft = spark.createDataFrame(dft)

In [6]:
#remove special characters from 'plot'
df =df.withColumn('plot',lower(regexp_replace(df['plot'],"[^a-zA-Z ]","")))

In [7]:
dft =dft.withColumn('plot',lower(regexp_replace(dft['plot'],"[^a-zA-Z ]","")))

In [8]:
#tokenize 'plot'
regexToken = RegexTokenizer(inputCol='plot',outputCol='tokens', pattern=" ")

In [9]:
df = regexToken.transform(df)

In [10]:
dft = regexToken.transform(dft)

In [11]:
nltk.download('stopwords')
#remove stopwords from 'plot'
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/cse587/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
stopWordsRemover = StopWordsRemover(inputCol = 'tokens',outputCol = 'cleaned').setStopWords(stop)

In [13]:
df = stopWordsRemover.transform(df)

In [14]:
dft = stopWordsRemover.transform(dft)

In [15]:
#create term vectors for TDMs
counts = CountVectorizer(inputCol= 'cleaned',outputCol= 'features',minDF=10)

In [16]:
model = counts.fit(df)

In [17]:
df = model.transform(df)

In [18]:
dft = model.transform(dft)

In [19]:
#drop unwanted columns
df = df.drop('plot','tokens')

In [20]:
dft = dft.drop('plot','tokens', 'movie_name')

In [21]:
#get labels
labels = df.select('genre')

In [22]:
labels= labels.rdd.flatMap(lambda x:x)

In [23]:
genre_names = ['Drama','Comedy','Romance Film','Thriller','Action','World cinema','Crime Fiction','Horror','Black-and-white','Indie','Action/Adventure','Adventure','Family Film','Short Film','Romantic drama','Animation','Musical','Science Fiction','Mystery','Romantic comedy']


In [24]:
#udf to check if genre in genre names is in the moviesgenre labels returns 1 if it is 0 otherwise
def checkGenre(x):
    if g in x:
        return 1
    else:
        return 0

for g in genre_names:
    checkGenre1 = udf(checkGenre)
    df = df.withColumn(g,checkGenre1('genre').cast('double'))

In [25]:
#fit models 1 for each genre
models = []
i=1
for g in genre_names:
    reg = LogisticRegression(regParam=0.1, featuresCol='features', labelCol=g)
    model = reg.fit(df)
    models.append(model)
    print(g, 'trained', i/len(genre_names))
    i+=1

Drama trained 0.05
Comedy trained 0.1
Romance Film trained 0.15
Thriller trained 0.2
Action trained 0.25
World cinema trained 0.3
Crime Fiction trained 0.35
Horror trained 0.4
Black-and-white trained 0.45
Indie trained 0.5
Action/Adventure trained 0.55
Adventure trained 0.6
Family Film trained 0.65
Short Film trained 0.7
Romantic drama trained 0.75
Animation trained 0.8
Musical trained 0.85
Science Fiction trained 0.9
Mystery trained 0.95
Romantic comedy trained 1.0


In [26]:
#make predictions on test data
predictions = []
for model in models:
    preds = model.transform(dft)
    predictions.append(preds)

In [27]:
#rename columns for specific genre
for i in range(len(predictions)):
    predictions[i] = predictions[i].withColumnRenamed('prediction', genre_names[i])

In [28]:
#convert predictiions to a string
def dfColToString(x):
    x = int(x)
    return str(x)

for i in range(len(predictions)):
    g = genre_names[i]
    checkGenre1 = udf(dfColToString)
    predictions[i] = predictions[i].withColumn(g, checkGenre1(g))

In [29]:
#drop unwanted cols for submission
for i in range(len(predictions)):
    predictions[i] =  predictions[i].drop('movie_name', 'features', 'rawPrediction','probability', 'cleaned')

In [30]:
#join all dfs of predictions
final = predictions[0]
for i in range(len(predictions))[1:]:
    toAdd = predictions[i]
    final = final.join(toAdd, on=['movie_id'], how='inner')

In [31]:
#combine predictions into 1 col str 1s and 0s
final = final.withColumn('predictions', concat(col('Drama'),lit(' '), col('Comedy'),lit(' '), col('Romance Film'),lit(' '), col('Thriller'),lit(' '), col('Action'),lit(' '), col('World cinema'),lit(' '), col('Crime Fiction'),lit(' '), col('Horror'),lit(' '), col('Black-and-White'),lit(' '), col('Indie'),lit(' '), col('Action/Adventure'),lit(' '), col('Action'),lit(' '), col('Family Film'),lit(' '), col('Short Film'),lit(' '), col('Romantic drama'),lit(' '), col('Animation'),lit(' '), col('Musical'),lit(' '), col('Science Fiction'),lit(' '), col('Mystery'),lit(' '), col('Romantic comedy') ))
    

In [32]:
for g in genre_names:
    final = final.drop(g)

In [33]:
#convert to csv
final.toPandas().to_csv('part1predictions.csv', index=False)

# F1 Score part1 = 0.92614
