### Analytics Goal:
Use keywords extracted from a user's social media account activity, together with the user's history of accepting or rejecting recommended accounts, to recommend other interesting social media accounts for them to follow on the platform.   
### Machine Learning Algorithm:
Keywords are used to train a Word2Vec model, and the resulting keyword embeddings are summed to build a profile vector for each user account and each candidate account. These profile vectors are then used as features to train a Logistic Regression model that predicts whether a user will follow the candidate account or reject the recommendation.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from functools import reduce
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import Word2Vec

In [0]:
# Create (or reuse) the Spark session for this notebook.
# The MongoDB Spark connector is requested through spark.jars.packages and
# spark.network.timeout is set to "7200" (no unit suffix is given here).
spark = (
    SparkSession.builder
    .appName("project")
    .config("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")
    .config("spark.network.timeout", "7200")
    .getOrCreate()
)

In [0]:
# MongoDB database details and credentials.
# The credentials are read from the environment (MONGODB_USER,
# MONGODB_PASSWORD, optionally MONGODB_ADDRESS) instead of being hard-coded,
# so that secrets do not end up in the notebook or in version control.
import os
import warnings
from urllib.parse import quote_plus

database = 'tencent'
collection = 'recologdetail'
user_name = os.environ.get('MONGODB_USER', 'cnayak')
password = os.environ.get('MONGODB_PASSWORD', '')
address = os.environ.get('MONGODB_ADDRESS', 'cluster0.mz09y.mongodb.net')

if not password:
    # Surface the misconfiguration here rather than as an opaque
    # authentication failure when the collection is loaded.
    warnings.warn(
        "MONGODB_PASSWORD is not set; the MongoDB connection will fail to authenticate."
    )

# Percent-encode the credentials so reserved URI characters such as ':', '/'
# or '@' in the user name or password cannot corrupt the connection string.
connection_string = (
    f"mongodb+srv://{quote_plus(user_name)}:{quote_plus(password)}"
    f"@{address}/{database}.{collection}"
)

In [0]:
# AWS S3 details.
# NOTE(review): the key values below are placeholders; supply real credentials
# from a secure source rather than committing them to the notebook.
aws_access_key = '***'
aws_secret_key = '***'

hadoop_conf = spark._jsc.hadoopConfiguration()
# Hadoop reads the S3A settings under their plain "fs.s3a.*" names. The
# "spark.hadoop." prefix is only meaningful on Spark properties, so the
# filesystem implementation is registered here without it.
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.access.key", aws_access_key)
hadoop_conf.set("fs.s3a.secret.key", aws_secret_key)
# NOTE(review): 'spark.jars.packages' (org.apache.hadoop:hadoop-aws:3.3.1) was
# previously written into the Hadoop configuration. It is a Spark property
# that is resolved when the session is created, so the hadoop-aws package has
# to be supplied there (or be present on the cluster) to be picked up.

In [0]:
# Write the timeout and heartbeat settings into the Hadoop configuration.
# NOTE(review): both keys are Spark property names; confirm that setting them
# on the Hadoop configuration (rather than on the Spark session) takes effect.
_hadoop_conf = spark._jsc.hadoopConfiguration()
for _setting, _duration in (
    ('spark.network.timeout', '7200s'),
    ('spark.executor.heartbeatInterval', '1200s'),
):
    _hadoop_conf.set(_setting, _duration)

#### Load data from mongodb


In [0]:
# Read the MongoDB collection addressed by connection_string into a DataFrame
# using the 'mongo' data source.
df = spark.read.format('mongo').option("uri",connection_string).load()

In [0]:
# Distinct keyword lists of the candidate accounts found in the loaded data.
item_kws = df.select(df['Keywords_list']).distinct()

In [0]:
# Convert a value to int, returning None for values that cannot be converted.
def IntegerSafe(value):
    """Convert ``value`` to ``int``, or return ``None`` if it cannot be converted.

    Only conversion failures are handled. Unrelated exceptions such as
    ``KeyboardInterrupt`` are no longer swallowed, as they were by the previous
    bare ``except``.

    Args:
        value: Any object accepted by ``int()``, typically a string.

    Returns:
        The integer value, or ``None`` when ``value`` is not convertible.
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return None

In [0]:
# Schema of the user keywords file: a required integer user id and an
# optional string holding the keyword weights.
user_kw_schema = StructType([
    StructField("UserId", IntegerType(), nullable=False),
    StructField("Keywords_weights", StringType(), nullable=True),
])

In [0]:
# Load the file containing keywords extracted from user social media activity
# (tweets, retweets, comments). Each line is split on tabs into a user id and
# a keyword-weights string.
# spark.sparkContext is used instead of the bare ``sc`` name, which this file
# never defines.
user_kw_rdd = (
    spark.sparkContext.textFile('s3://msds697-group18-2022/track1/user_key_word.txt')
    .map(lambda line: line.split('\t'))
    # Skip lines without a second field; indexing fields[1] would raise on them.
    .filter(lambda fields: len(fields) >= 2)
    .map(lambda fields: (IntegerSafe(fields[0]), fields[1]))
    # Skip rows whose id could not be parsed: user_kw_schema declares UserId
    # as non-nullable, so a None id cannot be stored.
    .filter(lambda row: row[0] is not None)
)

In [0]:
# Convert the RDD of (UserId, Keywords_weights) tuples to a DataFrame using
# user_kw_schema, then print the resulting schema.
user_kw_df = spark.createDataFrame(user_kw_rdd, user_kw_schema)
user_kw_df.printSchema()

In [0]:
# Preview the raw user keyword rows.
user_kw_df.show()

In [0]:
# get list of keywords for user account
def get_keywords(keywords):
    """Extract the keyword ids from a keyword-weights string.

    The input has the form ``"183:0.6673;2:0.35"`` and yields ``[183, 2]``.
    Entries whose id cannot be parsed as an integer are skipped, so the result
    never contains ``None`` (which would otherwise end up as the token
    ``'None'`` once the keywords are cast to strings).

    Args:
        keywords: ``";"``-separated ``"<id>:<weight>"`` entries, or ``None``.

    Returns:
        List of integer keyword ids; empty when ``keywords`` is ``None`` or
        an empty string.
    """
    if not keywords:
        return []
    parsed_ids = (IntegerSafe(entry.split(":")[0]) for entry in keywords.split(';'))
    return [kw for kw in parsed_ids if kw is not None]

In [0]:
# Register get_keywords as a UDF returning an array of integers.
# F.udf is used because ``functions`` is imported as F in this file, whereas
# the bare name ``udf`` is never imported.
get_kws = F.udf(get_keywords, ArrayType(IntegerType()))

In [0]:
# Add a 'Keywords_list' column holding the keyword ids parsed from
# 'Keywords_weights'.
user_kw_df = user_kw_df.withColumn('Keywords_list', get_kws('Keywords_weights'))

In [0]:
# Preview the user rows together with the parsed keyword lists.
user_kw_df.show()

In [0]:
# Combine the user-account keyword lists and the item-account keyword lists
# into one DataFrame; this is the corpus used to train Word2Vec.
keywords_all = item_kws.unionByName(user_kw_df.select('Keywords_list'))

In [0]:
# Number of keyword lists in the combined corpus.
keywords_all.count()

In [0]:
# cast encoded keywords into string. requirement of Word2Vec model
def cast_keywords(keywords_list):
    """Convert a list of encoded keywords to a list of strings.

    ``None`` entries are dropped instead of being turned into the literal
    token ``'None'``, which is not a keyword id and cannot be converted back
    with ``int()``.

    Args:
        keywords_list: Iterable of keyword ids, or ``None``.

    Returns:
        List of keyword ids as strings; empty when ``keywords_list`` is
        ``None`` or empty.
    """
    if keywords_list is None:
        return []
    return [str(kw) for kw in keywords_list if kw is not None]

In [0]:
# Register cast_keywords as a UDF returning an array of strings.
# F.udf is used because ``functions`` is imported as F in this file, whereas
# the bare name ``udf`` is never imported.
cast_kws = F.udf(cast_keywords, ArrayType(StringType()))

In [0]:
# Add a 'Keywords' column with the keyword ids cast to strings; this is the
# column Word2Vec reads.
keywords_all = keywords_all.withColumn('Keywords', cast_kws('Keywords_list'))

In [0]:
# Preview the training corpus.
keywords_all.show()

#### Initialize the Word2Vec model. The embedding size is set to 5


In [0]:
# Word2Vec estimator: 5-dimensional vectors, fixed seed 42, reading tokens
# from the 'Keywords' column. The output column is named 'model'.
word2Vec = Word2Vec(vectorSize=5, seed=42, inputCol="Keywords", outputCol="model")

In [0]:
# Fit Word2Vec on the combined keyword corpus with the iteration cap set to 10.
model = word2Vec.setMaxIter(10).fit(keywords_all)

In [0]:
# Preview the learned embeddings, one row per keyword.
model.getVectors().show()

In [0]:
# Bring all keyword embeddings to the driver as a list of rows.
kw_vectors = model.getVectors().collect()

In [0]:
# Lookup table from integer keyword id to its embedding vector, built from
# the collected rows for fast access.
keyword_vectors_map = {
    int(entry['word']): entry['vector'] for entry in kw_vectors
}

In [0]:
# Acceptance/rejection history: one row per (user, recommended account) with
# the outcome in 'Result'. A Result of -1 is mapped to 0; every other value is
# kept as it is.
log_df = df.select('UserId', 'ItemId', 'Result')
data = log_df.withColumn(
    "Result",
    F.when(F.col('Result') == -1, 0).otherwise(F.col('Result')),
)

In [0]:
# Attach each user's keyword list (inner join on UserId) and expose it as
# 'User_keywords'.
user_keywords = user_kw_df.select('UserId', 'Keywords_list')
data = (
    data.join(user_keywords, on='UserId', how='inner')
    .withColumnRenamed('Keywords_list', 'User_keywords')
)

In [0]:
# Distinct (ItemId, Keywords_list) pairs for the candidate accounts.
item_kw_df  = df.select('ItemId','Keywords_list').distinct()

In [0]:
# Attach each candidate account's keyword list (inner join on ItemId) and
# expose it as 'Item_keywords'.
data = (
    data.join(item_kw_df, on='ItemId', how='inner')
    .withColumnRenamed('Keywords_list', 'Item_keywords')
)

In [0]:
# Rename the 'Result' column to 'label', the column name the
# LogisticRegression model is trained against below.
data = data.withColumnRenamed('Result', 'label')

In [0]:
# get all the keyword embeddings for an account and sum them to get profile vector
def get_keyword_vector(keywords, vector_size=5):
    """Sum the embeddings of ``keywords`` into a single profile vector.

    Each keyword is looked up in the module-level ``keyword_vectors_map``;
    keywords without an embedding contribute a zero vector.

    Args:
        keywords: Iterable of keyword ids, or ``None``.
        vector_size: Dimension of the embeddings. Defaults to 5 and must match
            the size of the vectors stored in ``keyword_vectors_map``.

    Returns:
        Dense vector of length ``vector_size``. The zero vector is returned
        when ``keywords`` is ``None`` or empty.
    """
    zero_vector = Vectors.dense([0.0] * vector_size)
    if not keywords:
        return zero_vector
    embeddings = (keyword_vectors_map.get(kw, zero_vector) for kw in keywords)
    # Seeding the fold with the zero vector keeps ``reduce`` from raising on
    # an empty sequence and does not change the sum.
    return reduce(lambda total, vec: total + vec, embeddings, zero_vector)

In [0]:
# Register get_keyword_vector as a UDF returning a vector (sum of the keyword
# embeddings).
# F.udf is used because ``functions`` is imported as F in this file, whereas
# the bare name ``udf`` is never imported.
get_kws_vector = F.udf(get_keyword_vector, VectorUDT())

In [0]:
# Add 'User_vector': the profile vector computed from each user's keywords.
data_user_vector = data.withColumn('User_vector', get_kws_vector('User_keywords'))

In [0]:
# Preview the rows with the user profile vectors attached.
data_user_vector.show()

In [0]:
# Add 'Item_vector': the profile vector computed from each candidate
# account's keywords.
data_all_vectors = data_user_vector.withColumn('Item_vector', get_kws_vector('Item_keywords'))

In [0]:
# Assembler that concatenates the user and item profile vectors into a single
# 'features' column.
va = VectorAssembler(
    inputCols=['User_vector', 'Item_vector'],
    outputCol="features",
)

In [0]:
# Assemble the feature vector and keep only the two columns used for training
# and evaluation: 'features' and 'label'.
lpoints = va.transform(data_all_vectors).select("features", "label")

In [0]:
# Split into train (80%) and validation (20%) sets and cache both.
# A fixed seed (the same value used for Word2Vec above) is passed so the split
# is not re-randomised on every run.
# NOTE(review): repeatability also depends on the input rows arriving in the
# same partitioning - confirm if exact reproducibility is required.
splits = lpoints.randomSplit([0.8, 0.2], seed=42)
train = splits[0].cache()
valid = splits[1].cache()

In [0]:
# Preview the training set.
train.show()

In [0]:
# Initialize the Logistic Regression estimator (at most 10 iterations,
# regularization parameter 0.01, intercept fitted) and train it on the
# training set.
lr = LogisticRegression(
    maxIter=10,
    regParam=0.01,
    fitIntercept=True,
)
lrmodel = lr.fit(train)

In [0]:
# Predict labels for the validation dataset and preview the predictions.
validpredicts = lrmodel.transform(valid)
validpredicts.show()

In [0]:
# Print the fitted coefficients, one per entry of the 'features' vector.
print(lrmodel.coefficients)

In [0]:
# Evaluator used below to compute accuracy and F1 on the validation predictions.
# NOTE(review): the variable name suggests a binary evaluator, but the class
# used is MulticlassClassificationEvaluator.
bceval = MulticlassClassificationEvaluator()

In [0]:
# Report the accuracy of the Logistic Regression model on the validation set
# as "<metric name>:<value>".
bceval.setMetricName("accuracy")
print(f"{bceval.getMetricName()}:{bceval.evaluate(validpredicts)}")

In [0]:
# Switch the evaluator to the F1 metric and evaluate the validation predictions.
# The value is not printed; it is left as the final expression of the cell.
bceval.setMetricName("f1") 
bceval.evaluate(validpredicts)