In [0]:
# !apt-get install openjdk-8-jdk-headless -qq > /dev/null
# !wget -q http://us.mirrors.quenda.co/apache/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
# !tar xf spark-2.4.5-bin-hadoop2.7.tgz
# !pip install -q findspark

# import os
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# import findspark
# findspark.init()


import os

import findspark

# Point findspark at the Spark installation. SPARK_HOME (when set) takes
# precedence so the notebook is not tied to one machine; otherwise fall back
# to the install path this notebook was originally written against.
findspark.init(os.environ.get("SPARK_HOME", '/home/cse587/spark/spark-2.4.0-bin-hadoop2.7'))

In [0]:
from pyspark.sql import *
import pyspark.sql.functions as f
from pyspark.sql.types import FloatType, DoubleType, IntegerType
from pyspark.ml.feature import CountVectorizer,  CountVectorizerModel, StopWordsRemover, OneHotEncoder, StringIndexer
from pyspark.ml.classification import LogisticRegression, OneVsRest
# from pyspark.mllib.regression import LabeledPoint
# from pyspark.mllib.classification import LogisticRegressionWithLBFGS

In [0]:
# Create (or reuse) the SparkSession for this notebook. The former
# "spark.some.config.option" = "some-value" setting was a placeholder key
# that no Spark component reads, so it has been dropped.
spark = SparkSession.builder.appName('DicAssign3').getOrCreate()

In [0]:
# CSV options shared by the train and test files: the first row is a header,
# column types are inferred, and '"' is the escape character inside quoted fields.
csv_options = dict(inferSchema="true", header="true", escape='"')

# Training rows with no genre cannot serve as labelled examples, so drop them.
df = spark.read.csv("train.csv", **csv_options).na.drop(subset=["genre"])
df_test = spark.read.csv("test.csv", **csv_options)

# Genre lookup table; read with a header row and no schema inference.
mappings = spark.read.csv("mapping.csv", header="true")

In [6]:
# Show the inferred column names and types of the training frame.
df.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)



In [7]:
# Sanity check: list any rows whose genre is still null after the
# na.drop(subset=["genre"]) applied when the data was loaded.
df.filter(df.genre.isNull()).show()

+--------+----------+----+-----+
|movie_id|movie_name|plot|genre|
+--------+----------+----+-----+
+--------+----------+----+-----+



In [8]:
# Inspect the genre lookup table and its schema as read from mapping.csv.
mappings.show()
mappings.printSchema()

+---+----------------+
|_c0|               0|
+---+----------------+
|  0|           Drama|
|  1|          Comedy|
|  2|    Romance Film|
|  3|        Thriller|
|  4|          Action|
|  5|    World cinema|
|  6|   Crime Fiction|
|  7|          Horror|
|  8| Black-and-white|
|  9|           Indie|
| 10|Action/Adventure|
| 11|       Adventure|
| 12|     Family Film|
| 13|      Short Film|
| 14|  Romantic drama|
| 15|       Animation|
| 16|         Musical|
| 17| Science Fiction|
| 18|         Mystery|
| 19| Romantic comedy|
+---+----------------+

root
 |-- _c0: string (nullable = true)
 |-- 0: string (nullable = true)



In [0]:
# Turn the genre-name column ("0") into an array column by splitting on a
# comma followed by optional whitespace. The pattern is a raw string so the
# "\s" reaches Spark's regex engine verbatim rather than depending on Python
# passing an unrecognised escape sequence through unchanged.
mappings = mappings.withColumn("genre_array", f.split(f.col("0"), r",\s*"))

In [10]:
# The raw genre-name column is redundant once genre_array exists.
mappings = mappings.drop("0")
mappings.show()
mappings.printSchema()

# Genre vocabulary in index order: position i holds the name of genre id i,
# in the same order as the rows of mapping.csv shown above.
mapping_list = [
    "Drama",
    "Comedy",
    "Romance Film",
    "Thriller",
    "Action",
    "World cinema",
    "Crime Fiction",
    "Horror",
    "Black-and-white",
    "Indie",
    "Action/Adventure",
    "Adventure",
    "Family Film",
    "Short Film",
    "Romantic drama",
    "Animation",
    "Musical",
    "Science Fiction",
    "Mystery",
    "Romantic comedy",
]

+---+------------------+
|_c0|       genre_array|
+---+------------------+
|  0|           [Drama]|
|  1|          [Comedy]|
|  2|    [Romance Film]|
|  3|        [Thriller]|
|  4|          [Action]|
|  5|    [World cinema]|
|  6|   [Crime Fiction]|
|  7|          [Horror]|
|  8| [Black-and-white]|
|  9|           [Indie]|
| 10|[Action/Adventure]|
| 11|       [Adventure]|
| 12|     [Family Film]|
| 13|      [Short Film]|
| 14|  [Romantic drama]|
| 15|       [Animation]|
| 16|         [Musical]|
| 17| [Science Fiction]|
| 18|         [Mystery]|
| 19| [Romantic comedy]|
+---+------------------+

root
 |-- _c0: string (nullable = true)
 |-- genre_array: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
# The genre column holds a stringified Python list, e.g. "['Comedy', 'Drama']".
# Strip the square brackets and single quotes, leaving "Comedy, Drama".
list_punctuation = r'[\[\]\']'
genre_without_punctuation = f.regexp_replace(f.col("genre"), list_punctuation, '')
df1 = df.withColumn("genre_array", genre_without_punctuation)

In [12]:
# Preview the raw genre strings (untruncated) to see their list-like format.
df.select("genre").show(truncate=False)

+-------------------------------------------------------------------------+
|genre                                                                    |
+-------------------------------------------------------------------------+
|['World cinema', 'Drama']                                                |
|['Action/Adventure', 'Action', 'Science Fiction', 'Drama']               |
|['Musical', 'Action', 'Drama']                                           |
|['Comedy']                                                               |
|['Crime Fiction', 'World cinema', 'Drama']                               |
|['Action/Adventure', 'Action', 'Thriller', 'Drama']                      |
|['Thriller', 'Drama', 'Horror']                                          |
|['Drama']                                                                |
|['Black-and-white', 'Comedy', 'Romance Film', 'Romantic comedy', 'Drama']|
|['Animation', 'Short Film', 'Family Film']                               |
|['Comedy'] 

In [13]:
# Compare the original genre string with its bracket/quote-stripped version.
df1.show()

+--------+--------------------+--------------------+--------------------+--------------------+
|movie_id|          movie_name|                plot|               genre|         genre_array|
+--------+--------------------+--------------------+--------------------+--------------------+
|23890098|          Taxi Blues|Shlykov, a hard-w...|['World cinema', ...| World cinema, Drama|
|31186339|    The Hunger Games|The nation of Pan...|['Action/Adventur...|Action/Adventure,...|
|20663735|          Narasimham|Poovalli Induchoo...|['Musical', 'Acti...|Musical, Action, ...|
| 2231378|  The Lemon Drop Kid|The Lemon Drop Ki...|          ['Comedy']|              Comedy|
|  595909|   A Cry in the Dark|Seventh-day Adven...|['Crime Fiction',...|Crime Fiction, Wo...|
| 5272176|            End Game|The president is ...|['Action/Adventur...|Action/Adventure,...|
| 1952976|          Dark Water|{{plot}} The film...|['Thriller', 'Dra...|Thriller, Drama, ...|
|24225279|                Sing|The story begins ..

In [0]:
# Split the cleaned "A, B, C" genre string into an array of genre names.
genre_names = f.split(df1["genre_array"], ", ")
df2 = df1.withColumn("genre_array", genre_names)

In [15]:
# Count rows with a null genre in the training frame.
df.filter(df.genre.isNull()).count()

0

In [0]:
# Build a vectorizer from the fixed genre vocabulary (rather than fitting one)
# so that index i of the output vector always corresponds to mapping_list[i].
# binary=True clamps every non-zero count to 1.0, so each entry is a valid
# 0/1 label even if a genre name were repeated within a row.
vect = CountVectorizerModel.from_vocabulary(
    vocabulary=mapping_list,
    inputCol="genre_array",
    outputCol="ohe_genre",
    binary=True,
)

In [17]:
# Display the vectorizer's vocabulary; list position is the vector index.
vect.vocabulary

['Drama',
 'Comedy',
 'Romance Film',
 'Thriller',
 'Action',
 'World cinema',
 'Crime Fiction',
 'Horror',
 'Black-and-white',
 'Indie',
 'Action/Adventure',
 'Adventure',
 'Family Film',
 'Short Film',
 'Romantic drama',
 'Animation',
 'Musical',
 'Science Fiction',
 'Mystery',
 'Romantic comedy']

In [0]:
# Encode each row's genre_array into the "ohe_genre" vector column
# (one slot per entry of the genre vocabulary).
mappins_ohe = vect.transform(df2)

In [19]:
# Spot-check the genre arrays against their encoded vectors.
mappins_ohe.select("genre_array","ohe_genre").show(truncate=False)

+---------------------------------------------------------------+---------------------------------------+
|genre_array                                                    |ohe_genre                              |
+---------------------------------------------------------------+---------------------------------------+
|[World cinema, Drama]                                          |(20,[0,5],[1.0,1.0])                   |
|[Action/Adventure, Action, Science Fiction, Drama]             |(20,[0,4,10,17],[1.0,1.0,1.0,1.0])     |
|[Musical, Action, Drama]                                       |(20,[0,4,16],[1.0,1.0,1.0])            |
|[Comedy]                                                       |(20,[1],[1.0])                         |
|[Crime Fiction, World cinema, Drama]                           |(20,[0,5,6],[1.0,1.0,1.0])             |
|[Action/Adventure, Action, Thriller, Drama]                    |(20,[0,3,4,10],[1.0,1.0,1.0,1.0])      |
|[Thriller, Drama, Horror]                    

In [0]:
# Clean the training plots in three steps:
#   1. delete every character that is not an ASCII letter or a space,
#   2. collapse runs of whitespace into a single space,
#   3. lower-case the text.
# The whitespace pattern is a raw string so "\s" is not treated as a Python
# string escape; the regex Spark receives is unchanged.
# NOTE(review): step 1 deletes punctuation/newlines instead of replacing them
# with a space, so words separated only by such characters get fused. The
# test-set cleaning uses the same rule, so change both together if at all.
mappins_ohe = (
    mappins_ohe
    .withColumn("plot", f.regexp_replace(f.col("plot"), r'[^A-Za-z ]', ''))
    .withColumn("plot", f.regexp_replace(f.col("plot"), r'\s+', ' '))
    .withColumn("plot", f.lower(f.col("plot")))
)

In [0]:
# Clean the test plots with the same three steps used for the training plots:
#   1. delete every character that is not an ASCII letter or a space,
#   2. collapse runs of whitespace into a single space,
#   3. lower-case the text.
# The whitespace pattern is a raw string so "\s" is not treated as a Python
# string escape; the regex Spark receives is unchanged.
mappins_ohe_test = (
    df_test
    .withColumn("plot", f.regexp_replace(f.col("plot"), r'[^A-Za-z ]', ''))
    .withColumn("plot", f.regexp_replace(f.col("plot"), r'\s+', ' '))
    .withColumn("plot", f.lower(f.col("plot")))
)

In [0]:
# Tokenize the cleaned plots on single spaces. A plot that starts or ends with
# a space (or is empty) would otherwise yield empty-string tokens, which carry
# no meaning but would still be counted as a word; array_remove drops them.
# Train and test are tokenized with the identical expression.
plot_tokens = f.array_remove(f.split(f.col("plot"), " "), "")
mappins_ohe1 = mappins_ohe.withColumn("plot_tokens", plot_tokens)
mappins_ohe_test1 = mappins_ohe_test.withColumn("plot_tokens", plot_tokens)

In [0]:
# Remove stop words from the plot tokens. No word list is passed, so
# StopWordsRemover's default list applies. The same remover instance is used
# for train and test so both splits are filtered identically.
remover = StopWordsRemover(inputCol="plot_tokens", outputCol="filtered_plot")
filtered_df = remover.transform(mappins_ohe1)
filtered_df_test = remover.transform(mappins_ohe_test1)

In [24]:
# Show the most frequent remaining words, first for the training plots and
# then for the test plots: one row per token, grouped and sorted by count.
for token_frame in (filtered_df, filtered_df_test):
    exploded = token_frame.withColumn("word", f.explode(f.col("filtered_plot")))
    word_counts = exploded.groupBy("word").count()
    word_counts.sort("count", ascending=False).show()

+-------+-----+
|   word|count|
+-------+-----+
|    one|23628|
|   back|18650|
|    two|16395|
|   film|14665|
|  tells|14527|
| father|13850|
|    man|13751|
|   time|13119|
|    new|13107|
|   life|13031|
|    get|12980|
|   love|12882|
|   home|12736|
|  house|12368|
|   find|12288|
|   also|12190|
|however|12119|
|  finds|12048|
|  later|11757|
| family|11719|
+-------+-----+
only showing top 20 rows

+-------+-----+
|   word|count|
+-------+-----+
|    one| 5851|
|   back| 4617|
|    two| 4126|
|  tells| 3797|
|   film| 3559|
| father| 3492|
|    man| 3474|
|    get| 3351|
|   love| 3244|
|   time| 3228|
|   home| 3206|
|    new| 3191|
|   find| 3083|
|   also| 3059|
|however| 3046|
|   life| 3034|
|  later| 3026|
|  finds| 2951|
| family| 2883|
|  house| 2875|
+-------+-----+
only showing top 20 rows



In [0]:
# Term-count vectorizer over the stop-word-filtered tokens; the vocabulary is
# capped at 10,000 terms and the counts are written to the "matrix" column.
plot_vectorizer = CountVectorizer(inputCol="filtered_plot",outputCol="matrix",vocabSize=10000)

In [0]:
# Learn the vocabulary from the training plots only.
plot_vect_model = plot_vectorizer.fit(filtered_df)

In [0]:
# Apply the training-fitted vocabulary to both splits so that feature indices
# mean the same thing in train and test.
term_doc_matrix = plot_vect_model.transform(filtered_df)
term_doc_matrix_test = plot_vect_model.transform(filtered_df_test)

In [28]:
# Print the schemas of the vectorized train and test frames.
term_doc_matrix.printSchema()
term_doc_matrix_test.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- genre_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ohe_genre: vector (nullable = true)
 |-- plot_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- matrix: vector (nullable = true)

root
 |-- movie_id: integer (nullable = true)
 |-- movie_name: string (nullable = true)
 |-- plot: string (nullable = true)
 |-- plot_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_plot: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- matrix: vector (nullable = true)



In [29]:
# Keep only what the models need, under the column names Spark ML expects:
# the term-count vector becomes "features" and (train only) the genre vector
# becomes "label".
features_col = f.col("matrix").alias("features")
term_doc_matrix1 = term_doc_matrix.select(
    "movie_id",
    features_col,
    f.col("ohe_genre").alias("label"),
)
term_doc_matrix_test1 = term_doc_matrix_test.select("movie_id", features_col)

# Number of training examples.
term_doc_matrix1.count()

31109

In [30]:
# Print the schemas of the model-ready train and test frames.
for model_frame in (term_doc_matrix1, term_doc_matrix_test1):
    model_frame.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: vector (nullable = true)

root
 |-- movie_id: integer (nullable = true)
 |-- features: vector (nullable = true)



In [31]:
# Look at the first label vector to see its in-memory representation.
term_doc_matrix1.select("label").head(1)

[Row(label=SparseVector(20, {0: 1.0, 5: 1.0}))]

In [32]:
# Preview the feature and label vectors of the training frame.
term_doc_matrix1.show()

+--------+--------------------+--------------------+
|movie_id|            features|               label|
+--------+--------------------+--------------------+
|23890098|(10000,[125,179,4...|(20,[0,5],[1.0,1.0])|
|31186339|(10000,[2,4,7,11,...|(20,[0,4,10,17],[...|
|20663735|(10000,[1,5,9,11,...|(20,[0,4,16],[1.0...|
| 2231378|(10000,[7,8,12,14...|      (20,[1],[1.0])|
|  595909|(10000,[2,8,9,14,...|(20,[0,5,6],[1.0,...|
| 5272176|(10000,[2,3,6,12,...|(20,[0,3,4,10],[1...|
| 1952976|(10000,[0,1,2,3,4...|(20,[0,3,7],[1.0,...|
|24225279|(10000,[0,1,2,6,7...|      (20,[0],[1.0])|
| 2462689|(10000,[0,3,8,16,...|(20,[0,1,2,8,19],...|
|20532852|(10000,[10,30,37,...|(20,[12,13,15],[1...|
|15401493|(10000,[1,3,5,7,8...|      (20,[1],[1.0])|
|18188932|(10000,[0,2,19,33...|(20,[0,1,5,6],[1....|
| 2940516|(10000,[3,6,14,16...|      (20,[1],[1.0])|
| 1480747|(10000,[0,1,2,4,5...|      (20,[1],[1.0])|
|24448645|(10000,[0,9,54,72...|      (20,[7],[1.0])|
|15072401|(10000,[6,19,60,8...|(20,[3,6,7,18],

In [0]:
# Train one independent binary logistic-regression model per genre:
# model k predicts whether genre mapping_list[k] applies to a plot.

# Every fit below reads the same training frame, so cache it once instead of
# recomputing the whole text-processing pipeline for each genre.
train_df = term_doc_matrix1.cache()

lrModels = []
for i in range(len(mapping_list)):
    # Bind the current index as a default argument so the UDF carries its own
    # copy and does not depend on the value of the loop variable at the moment
    # Spark serializes the lambda. DoubleType is the label type Spark ML uses.
    firstElement = f.udf(lambda v, idx=i: float(v[idx]), DoubleType())

    # Replace the label vector with the scalar 0/1 entry for genre i.
    new_df = train_df.withColumn("label", firstElement("label"))
    lr = LogisticRegression(maxIter=100, regParam=0.3, elasticNetParam=0)
    lrModel = lr.fit(new_df)
    lrModels.append(lrModel)

In [0]:
# for i in lrModels:
#     trainingSummary = i.summary
#     print(trainingSummary.accuracy)

In [0]:
# Score the test set with every per-genre model, keeping the models' order,
# and cast each model's "prediction" column to an integer.
scored_frames = [model.transform(term_doc_matrix_test1) for model in lrModels]
predictions = [
    scored.withColumn("prediction", scored["prediction"].cast(IntegerType()))
    for scored in scored_frames
]

In [36]:
# Number of test rows that the first model (index 0) predicts as positive.
predictions[0].where(f.col("prediction") == 1.0).count()

3757

In [37]:
# Show the columns produced by scoring the test set with the first model.
predictions[0].printSchema()

root
 |-- movie_id: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: integer (nullable = true)



In [0]:
# Assemble one space-separated string of predictions per movie, in model
# order. Start from the first model's prediction, then for each further model
# join its prediction on movie_id and append it to the running string.
# NOTE(review): the joins assume movie_id is unique within the test set.
final_df = predictions[0].selectExpr("movie_id", "prediction as predictions")
for genre_idx, scored in enumerate(predictions[1:], start=1):
    pred_col = "prediction" + str(genre_idx)
    per_genre = scored.selectExpr("movie_id", "prediction as " + pred_col)
    joined = final_df.join(per_genre, ["movie_id"])
    appended = f.concat(f.col("predictions"), f.lit(" "), f.col(pred_col))
    final_df = joined.withColumn("predictions", appended).select("movie_id", "predictions")

In [39]:
# Check the shape of the assembled result: its schema and its row count.
final_df.printSchema()
final_df.count()

root
 |-- movie_id: integer (nullable = true)
 |-- predictions: string (nullable = true)



7777

In [40]:
# Look at one assembled row to confirm the space-separated prediction format.
final_df.head(1)

[Row(movie_id=1335380, predictions='1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0')]

In [0]:
# Collect the predictions to the driver as a pandas DataFrame and write them
# to pred1.csv without the pandas index column.
predictions_pdf = final_df.toPandas()
predictions_pdf.to_csv("pred1.csv", index=False)