In [55]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName("Retweet-Prediction").getOrCreate()
df= spark.read.csv("../data/train.data", sep= '\t')
label= spark.read.csv("../data/train.solution", sep= '\t')

In [56]:
print("Shape of the data: ", df.count(), ",", len(df.columns))
print("Shape of the label: ", label.count(),  ",", len(df.columns))

Shape of the data:  8151524 , 11
Shape of the label:  8151524 , 11


In [57]:
from functools import reduce

old_cols= df.columns
new_cols= ["Tweet_id", "Username", "Timestamp", "#Followers",
            "#Friends", "#Favorites", "Entities", "Sentiment", "Mentions",
            "Hashtags", "URLs"]
old_label_col= label.columns
new_label_col= ["label"]

df = reduce(lambda df, idx: df.withColumnRenamed(old_cols[idx], new_cols[idx]), range(len(old_cols)), df)
label = reduce(lambda label, idx: label.withColumnRenamed(old_label_col[idx], new_label_col[idx]), range(len(old_label_col)), label)

In [58]:
df.count()== label.count()

True

In [7]:
df.select("#Followers", "#Friends").show(3)

+----------+--------+
|#Followers|#Friends|
+----------+--------+
|       619|     770|
|     36365|   19344|
|      5018|    1933|
+----------+--------+
only showing top 3 rows



In [59]:
from pyspark.sql.functions import monotonically_increasing_id

df = df.withColumn("id", monotonically_increasing_id())
label = label.withColumn("id", monotonically_increasing_id())

In [60]:
df = df.join(label, "id", "inner").drop("id")

In [61]:
df.columns

['Tweet_id',
 'Username',
 'Timestamp',
 '#Followers',
 '#Friends',
 '#Favorites',
 'Entities',
 'Sentiment',
 'Mentions',
 'Hashtags',
 'URLs',
 'label']

In [17]:
df.select("#Followers", "#Friends", "label").show(3)

+----------+--------+-----+
|#Followers|#Friends|label|
+----------+--------+-----+
|        50|      99|    0|
|    667486|     372|  153|
|       134|    1124|    0|
+----------+--------+-----+
only showing top 3 rows



In [67]:
#data= df
df= df.select("#Followers", "#Friends", "#Favorites", "label")
df = df.withColumn("#Followers", df["#Followers"].cast("double"))
df = df.withColumn("#Friends", df["#Friends"].cast("double"))
df = df.withColumn("#Favorites", df["#Favorites"].cast("double"))
df = df.withColumn("label", df["label"].cast("double"))

In [68]:
df.dtypes

[('#Followers', 'double'),
 ('#Friends', 'double'),
 ('#Favorites', 'double'),
 ('label', 'double')]

In [71]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression

vecAssembler = VectorAssembler(inputCols=["#Favorites", "#Followers", "#Friends"], outputCol="features")
lr= LinearRegression()
pipeline = Pipeline(stages=[vecAssembler, lr])

trainDF, testDF = df.randomSplit([.7, .3], seed=42)
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
predDF.select("#Favorites", "#Followers", "#Friends", "label", "prediction").show(5)

+----------+----------+--------+-----+----------------+
|#Favorites|#Followers|#Friends|label|      prediction|
+----------+----------+--------+-----+----------------+
|       0.0|       0.0|     0.0|  0.0|35.9098968748891|
|       0.0|       0.0|     0.0|  0.0|35.9098968748891|
|       0.0|       0.0|     0.0|  0.0|35.9098968748891|
|       0.0|       0.0|     0.0|  0.0|35.9098968748891|
|       0.0|       0.0|     0.0|  0.0|35.9098968748891|
+----------+----------+--------+-----+----------------+
only showing top 5 rows



In [72]:
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator= RegressionEvaluator()
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.0038296589432468275


In [75]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

vecAssembler = VectorAssembler(inputCols=["#Favorites", "#Followers", "#Friends"], outputCol="features")
rf= RandomForestRegressor()
pipeline = Pipeline(stages=[vecAssembler, rf])

trainDF, testDF = df.randomSplit([.7, .3], seed=42)
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)
predDF.select("#Favorites", "#Followers", "#Friends", "label", "prediction").show(5)

+----------+----------+--------+-----+------------------+
|#Favorites|#Followers|#Friends|label|        prediction|
+----------+----------+--------+-----+------------------+
|       0.0|       0.0|     0.0|  0.0|33.297817827778815|
|       0.0|       0.0|     0.0|  0.0|33.297817827778815|
|       0.0|       0.0|     0.0|  0.0|33.297817827778815|
|       0.0|       0.0|     0.0|  0.0|33.297817827778815|
|       0.0|       0.0|     0.0|  0.0|33.297817827778815|
+----------+----------+--------+-----+------------------+
only showing top 5 rows



In [76]:
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator= RegressionEvaluator()
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.0019903051652915282


## Exploring Sentiments

In [88]:
df= data.select("#Favorites", "#Followers", "#Friends", "Sentiment", "label")
df.show(3)

+----------+----------+--------+---------+-----+
|#Favorites|#Followers|#Friends|Sentiment|label|
+----------+----------+--------+---------+-----+
|         0|        50|      99|     2 -1|    0|
|       154|    667486|     372|     1 -1|  153|
|         0|       134|    1124|     1 -4|    0|
+----------+----------+--------+---------+-----+
only showing top 3 rows



In [89]:
from pyspark.sql.functions import split
sentiments= df.select(split(df.Sentiment," ")).rdd.flatMap(
              lambda x: x).toDF(schema=["pos","neg"])

In [90]:
from pyspark.sql.functions import monotonically_increasing_id

df = df.withColumn("id", monotonically_increasing_id())
sentiments = sentiments.withColumn("id", monotonically_increasing_id())
df = df.join(sentiments, "id", "inner").drop("id")
del sentiments

In [85]:
df.select("#Favorites", "#Followers", "#Friends", "pos", "neg", "label").show(3)

+----------+----------+--------+---+---+-----+
|#Favorites|#Followers|#Friends|pos|neg|label|
+----------+----------+--------+---+---+-----+
|         0|       107|     255|  1| -1|    0|
|         7|      1140|    1122|  3| -1|    1|
|         0|        52|     191|  2| -1|    0|
+----------+----------+--------+---+---+-----+
only showing top 3 rows



In [91]:
df = df.withColumn("#Followers", df["#Followers"].cast("double"))
df = df.withColumn("#Friends", df["#Friends"].cast("double"))
df = df.withColumn("#Favorites", df["#Favorites"].cast("double"))
df = df.withColumn("pos", df["pos"].cast("double"))
df = df.withColumn("neg", df["neg"].cast("double"))
df = df.withColumn("label", df["label"].cast("double"))

In [93]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor

vecAssembler = VectorAssembler(
    inputCols=["#Favorites", "#Followers", "#Friends", "pos", "neg"], 
    outputCol="features")

rf= RandomForestRegressor()
pipeline = Pipeline(stages=[vecAssembler, rf])

trainDF, testDF = df.randomSplit([.7, .3], seed=42)
pipelineModel = pipeline.fit(trainDF)
predDF = pipelineModel.transform(testDF)

In [94]:
from pyspark.ml.evaluation import RegressionEvaluator
regressionEvaluator= RegressionEvaluator()
r2 = regressionEvaluator.setMetricName("r2").evaluate(predDF)
print(f"R2 is {r2}")

R2 is 0.0025308919993899393


## Exploring Categorical Features

In [96]:
entities= data.select("entities")
entities.show(3, truncate= False)

+-------------------------------------------------------------------------+
|entities                                                                 |
+-------------------------------------------------------------------------+
|obese:Obesity:-2.3419573228805777;                                       |
|hong kong:Hong_Kong:-1.992910022963458;                                  |
|xenophobic:Xenophobia:-1.5869752955754004;china:China:-2.113921624336916;|
+-------------------------------------------------------------------------+
only showing top 3 rows



In [147]:
mentions= data.select("mentions")
mentions.show(3, truncate= False)

+--------------------------+
|mentions                  |
+--------------------------+
|carlquintanilla           |
|null;                     |
|SenSchumer realDonaldTrump|
+--------------------------+
only showing top 3 rows



In [99]:
hashtags= data.select("hashtags")
hashtags.show(3, truncate= False)

+----------------------+
|hashtags              |
+----------------------+
|null;                 |
|LIVE: hongkongprotests|
|null;                 |
+----------------------+
only showing top 3 rows



In [100]:
urls= data.select("URLs")
urls.show(3, truncate= False)

+----------------------+
|URLs                  |
+----------------------+
|null;                 |
|https://sc.mp/ainy1:-:|
|null;                 |
+----------------------+
only showing top 3 rows



### Addition of Categorical Features

1. Perform TF-IDF over the mentions and/ or entities/hastags/URLs with limited vocabulary and observe performance.
2. Repeat the process by enhancing the vocabulary and observe changes in performance.

### Extraction from Existing Features
1. Examine if the model behaves better by adding an average sentiment by summing the positive and negative sentiments
2. Observe by splitting the timestamp into days, month and year (dropping year if neeeded or performing One-Hot Encoding) if the individual days, month and/or year play some role in improving the model
3. Obtain the count of URLs in each tweet (probably, tweets with more links are more likely to get viral)

### 