In [1]:
# Bootstrap Spark for this kernel. Presumably this must run before the
# `pyspark` imports in the next cell so that they resolve — confirm against
# the findspark documentation.
import findspark
findspark.init()

In [2]:
import re
import subprocess

from pyspark import SparkContext
from pyspark.sql import SparkSession
from get_data import get_all_memes, save_meme_tags, get_memedroid_data, get_twitter_data, get_imgur_data, get_reddit_data
from utils import train_model, get_popular_tag, get_source_upvotes, upload_blob, upload_to_bucket
from pyspark.ml import PipelineModel
from google.cloud import storage

# Shared Spark handles for the whole notebook: `sc` is used for the RDD-based
# tag counting, `spark` for building DataFrames.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [62]:
number_of_clusters = 10

# Load the memes and cache them: `memes_df` feeds the cluster assignment
# below, the per-source upvote extraction and the report visualisations, so it
# has to exist when the notebook is run from a fresh kernel.
memes_df = get_all_memes(spark)
memes_df = memes_df.cache()

# Training is skipped in favour of the pipeline already stored on HDFS.
# Uncomment to retrain:
# memes_df_train = memes_df.drop_duplicates(subset=['text'])
# model = train_model(memes_df_train, number_of_clusters, 50)

model = PipelineModel.load("hdfs:///models/model")

# Assign every meme to a cluster and keep only the columns needed downstream.
X = model.transform(memes_df)
X = X.select('id', 'url', 'image_path', 'source', 'prediction','additional_data')
X = X.cache()

In [63]:
# Number of memes assigned to each cluster.
cluster_sizes = X.groupBy('prediction').count()
cluster_sizes.show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1| 2844|
|         6|  709|
|         3|30751|
|         5| 1093|
|         9| 5886|
|         4|38346|
|         8| 6791|
|         7| 8018|
|         2| 6742|
|         0| 8232|
+----------+-----+



In [64]:
def get_popular_tag(list_of_tags, cluster_names):
    """Return the most popular tag that can serve as a new, unique cluster name.

    NOTE(review): this definition shadows the ``get_popular_tag`` imported
    from ``utils`` at the top of the notebook — confirm which one is meant to
    be authoritative.

    Parameters
    ----------
    list_of_tags : sequence of (tag, ...) entries
        Candidate tags, most popular first. Only the first element of each
        entry (the tag itself) is read. The sequence is not modified.
    cluster_names : collection of str
        Names that are already taken by other clusters.

    Returns
    -------
    str
        The first tag that is neither generic (see ``tags_to_omit``) nor
        already used. The tag ``'randommemedump'`` is reported as ``'other'``.

    Raises
    ------
    IndexError
        If no entry of ``list_of_tags`` qualifies (including an empty input).
    """
    tags_to_omit = {'', 'memes', 'meme', 'funny', 'dump', 'memedump', 'dankmemes',
                    'reddit', 'description', 'twitter', 'lol', 'thumbnailhash'}

    for entry in list_of_tags:
        tag = entry[0]
        if tag in tags_to_omit or tag in cluster_names:
            continue

        # 'randommemedump' is exposed under the friendlier label 'other'.
        name = 'other' if tag == 'randommemedump' else tag

        # The alias must be checked as well, otherwise two clusters could both
        # end up being called 'other'.
        if name in cluster_names:
            continue
        return name

    raise IndexError("no unused, non-generic tag left to name the cluster")

In [65]:
# Presumably writes the per-cluster tag files (hdfs:///tags/all_tags_<id>) that
# are read back below — confirm in get_data.save_meme_tags.
save_meme_tags(X, number_of_clusters)

# Name every cluster after its most frequent tag that is not taken yet.
cluster_names = []
for cluster_id in range(number_of_clusters):
    tags_path = "hdfs:///tags/all_tags_{0}".format(cluster_id)
    # Normalise each word (lower-case, alphanumerics only) and count occurrences.
    tag_counts = (
        sc.textFile(tags_path)
        .flatMap(lambda line: line.split(" "))
        .map(lambda word: (re.sub('[^A-Za-z0-9]+', '', word.lower()), 1))
        .reduceByKey(lambda left, right: left + right)
    )
    ranked_tags = tag_counts.sortBy(lambda pair: pair[1], ascending=False).collect()
    cluster_names.append(get_popular_tag(ranked_tags, cluster_names))

In [66]:
# Display the label chosen for each cluster (list index == cluster id).
cluster_names

['awesome',
 'youtube',
 'wholesome',
 'other',
 'humor',
 'tiktok',
 'asua4r3',
 'gaming',
 'history',
 'coronavirus']

In [67]:
# Lookup table mapping the numeric cluster id to its label.
tmp = spark.createDataFrame(
    [(cluster_id, cluster_names[cluster_id]) for cluster_id in range(number_of_clusters)],
    ["prediction", "cluster"],
)

# Per-source upvote statistics; each source stores its score under a different field name.
memedroid = get_source_upvotes(get_memedroid_data(memes_df), 'upvotes', 'memedroid')
memedroid.cache()

twitter = get_source_upvotes(get_twitter_data(memes_df), 'favorite_count', 'twitter')
twitter.cache()

reddit = get_source_upvotes(get_reddit_data(memes_df), 'upvotes', 'reddit')
reddit.cache()

imgur = get_source_upvotes(get_imgur_data(memes_df), 'ups', 'imgur')
imgur.cache()

# Stack the four sources into a single frame.
all_upvotes = memedroid
for source_upvotes in (twitter, reddit, imgur):
    all_upvotes = all_upvotes.union(source_upvotes)

In [68]:
# Attach the upvote statistics (by meme id) and the cluster label (by cluster
# id), then discard rows that still contain nulls.
X = (
    X.join(all_upvotes, on=['id'], how="left_outer")
    .join(tmp, on=['prediction'], how="left_outer")
    .na.drop()
)

In [69]:
# Check which columns are available after the joins.
X.printSchema()

root
 |-- prediction: integer (nullable = false)
 |-- id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- image_path: string (nullable = true)
 |-- source: string (nullable = true)
 |-- additional_data: string (nullable = true)
 |-- upvotes: integer (nullable = true)
 |-- upvotes_centile: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- cluster: string (nullable = true)



In [70]:
# Full export — run this cell only if all data is to be uploaded.
export_columns = ['id', 'url', 'image_path', 'source', 'timestamp', 'upvotes', 'upvotes_centile', 'cluster']
X = X.select(*export_columns)

# Write the export as JSON through a single partition, replacing any previous export.
json_writer = X.coalesce(1).write.format('json').mode("overwrite")
json_writer.save('hdfs:///json_bucket')

upload_to_bucket('/json_bucket',  "tmp-data-for-website/all_data.json")

File /home/data_to_upload_on_bucket/tmp.json uploaded to tmp-data-for-website/all_data.json.


Daily data

In [9]:
# Daily export: meme ids with their upvote statistics and cluster label.
daily_columns = ['id', 'upvotes', 'upvotes_centile', 'cluster']
Y = X.select(*daily_columns)

daily_writer = Y.coalesce(1).write.format('json').mode("overwrite")
daily_writer.save('hdfs:///json_bucket')

# No destination is passed, so upload_to_bucket uses its own default target
# (the recorded output shows tmp-data-for-website/daily/daily.json).
upload_to_bucket('/json_bucket')

File /home/data_to_upload_on_bucket/tmp.json uploaded to tmp-data-for-website/daily/daily.json.


### Visualizations for report

#### PCA

In [71]:
import matplotlib.pyplot as plt
import numpy as np

from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.feature import PCA
from pyspark.sql import functions
from pyspark.sql.types import StringType, DoubleType
from pyspark.sql.functions import translate

In [72]:
# Re-apply the pipeline to get the full set of output columns back: X was
# narrowed to the export columns earlier, while the evaluation and PCA below
# read the 'features' column.
X = model.transform(memes_df)

In [73]:
# Score the clustering using the 'features' column. Presumably the evaluator's
# default metric is the silhouette score, as the variable name suggests —
# confirm in the pyspark documentation.
evaluator = ClusteringEvaluator(featuresCol='features')
silhouette = evaluator.evaluate(X)

In [74]:
# Cluster centres of the fitted clustering stage. The stage is located through
# the public `stages` attribute by looking for the one exposing
# `clusterCenters`, instead of reaching into `__dict__` with a hard-coded
# position (previously index 4), which breaks as soon as the pipeline layout changes.
clustering_stages = [stage for stage in model.stages if hasattr(stage, 'clusterCenters')]
if not clustering_stages:
    raise ValueError("the loaded pipeline has no stage exposing clusterCenters()")
centers = clustering_stages[0].clusterCenters()

In [75]:
# Reduce the 'features' vectors to two principal components ('pcaFeatures') so
# that the clusters can be drawn on a 2-D scatter plot.
pca = PCA(k=2, inputCol="features", outputCol="pcaFeatures")
model_pca = pca.fit(X)
X = model_pca.transform(X)

In [76]:
# Project every cluster centre with the fitted principal components so that the
# centres can be placed on the same 2-D plot as the memes.
components_t = model_pca.pc.toArray().T
centers_pca = np.array(
    [np.multiply(components_t, center).sum(axis=1) for center in centers]
)

In [None]:
# Work on a separately named frame: rebinding `memes_df` here would replace the
# raw memes with the transformed frame and break a re-run of the earlier cells.
memes_pca_df = X

# The 'pcaFeatures' vector is turned into two numeric columns through its
# string form: split on the comma, strip the surrounding brackets, cast to double.
split_col = functions.split(memes_pca_df["pcaFeatures"].cast(StringType()), ',')
memes_pca_df = memes_pca_df.withColumn('x', translate(split_col.getItem(0), "[", "").cast(DoubleType()))
memes_pca_df = memes_pca_df.withColumn('y', translate(split_col.getItem(1), "]", "").cast(DoubleType()))

# Only these three columns are plotted, so only they are collected to the driver.
df = memes_pca_df.select('prediction', 'x', 'y').toPandas()
groups = df.groupby('prediction')

fig, ax = plt.subplots()
ax.margins(0.05)
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=5, label=name)
    # Mark the projected cluster centre with the cluster id.
    ax.text(centers_pca[name,0], centers_pca[name,1], s = str(name), fontsize = 10)
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.legend(loc='lower right', prop={'size': 9})
# k is taken from the loaded model; 50 presumably mirrors the last argument of
# the (commented-out) train_model call — confirm before reusing the figure.
ax.title.set_text("k={0}, wn={1}, Silhouette={2}".format(len(centers),50,silhouette))
plt.show()
print("PCA, explained variance= {0}".format(model_pca.explainedVariance))

In [12]:
# Save the cluster scatter plot produced above for the report.
fig.savefig('clusterization_w2v2.png')