In [1]:
import gensim

# Check this path
# Pre-trained word embeddings file; going by its name this is the GloVe 6B
# 100-dimensional set converted to word2vec text format -- TODO confirm.
pwe_path = "DATASTORE/PWEs/glove/glove.6B.100d.w2vformat.txt"

# Load the embeddings as gensim KeyedVectors; `model` is passed to
# calc_iter_acc in a later cell.
model = gensim.models.KeyedVectors.load_word2vec_format(pwe_path)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [2]:
#Import all  dependencies
from sklearn import metrics
import gensim
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import os
import pandas as pd
import numpy as np
import sys
from datetime import datetime 
from scipy import spatial
#result = 1 - spatial.distance.cosine(v1, v2)

from gensim.test.utils import get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec
from multiprocessing import cpu_count

here = os.getcwd()
import json
import pathlib

from reviewFunctions import user2vec_test

import logging
# Append every record to glove6B_new.log. The threshold is CRITICAL, which is
# why the notebook emits its progress messages through logging.critical().
logging.basicConfig(filename="glove6B_new.log",
                    filemode='a',
                    format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.CRITICAL)

# Banner marking the start of this experiment's records in the shared log file.
logging.critical("\n PWE Experiment 3: Tweet embedding with Average Glove2(tweet) PWE - Train and Test Splitted %60 and %40")
# NOTE(review): `logger` is not referenced by any of the cells visible here;
# all later calls go through the root `logging` module functions.
logger = logging.getLogger('glove6B_new')

# English stop words from NLTK, kept as a set so that the membership test in
# preprocess() is a hash lookup.
stop_words = set(stopwords.words('english'))


# Previous call signature: saveJson(path_prefix, user_vecs, ctg_vecs, self.epoch) -- changed to the one below.
def saveJson(path_prefix, user_vecs_test, ctg_vecs_train, user_vecs_train, epoch):
    """Write the three embedding dictionaries as JSON under ``<path_prefix>/embeddings``.

    The files are named ``users_test<epoch>.json``, ``categories_train<epoch>.json``
    and ``users_train<epoch>.json`` and are written in that order.

    Parameters
    ----------
    path_prefix : str
        Directory that contains (or will contain) the ``embeddings`` folder.
        The file name is passed through gensim's ``get_tmpfile``.
        NOTE(review): per the gensim docs ``get_tmpfile`` joins the name onto
        the system temp directory, so only an absolute ``path_prefix`` is
        expected to land outside the temp directory -- confirm.
    user_vecs_test, ctg_vecs_train, user_vecs_train : dict
        JSON-serialisable payloads (they are handed straight to ``json.dump``).
    epoch : int or str
        Suffix appended to each file name.

    Raises
    ------
    TypeError
        Propagated from ``json.dump`` if a payload is not JSON-serialisable.
    """
    logging.critical("saveJSon started")

    outputs = (
        ('users_test', user_vecs_test),
        ('categories_train', ctg_vecs_train),
        ('users_train', user_vecs_train),
    )
    for stem, payload in outputs:
        path = get_tmpfile('{}/embeddings/{}{}.json'.format(path_prefix, stem, epoch))
        # Create the target folder on first use instead of failing with
        # FileNotFoundError when "embeddings" does not exist yet.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # The context manager closes the file; no explicit close() is needed.
        with open(path, 'w') as outfile:
            json.dump(payload, outfile)

    logging.critical("saveJSon finished")

/vol/ibrahim/.conda/envs/py37irh/bin/python


In [3]:
def preprocess(text):
    """Turn raw text into a list of lower-cased, purely alphabetic, non-stop-word tokens.

    The text is lower-cased, split with ``word_tokenize``, and every token that
    is in ``stop_words`` or fails ``str.isalpha`` is dropped.
    """
    tokens = word_tokenize(text.lower())
    # One pass: a token survives only if it is not a stop word and is alphabetic.
    return [token for token in tokens if token not in stop_words and token.isalpha()]

def has_vector_representation(word2vec_model, doc):
    """Tell whether any token of ``doc`` is present in ``word2vec_model.vocab``.

    Returns ``False`` for an empty document.
    """
    for token in doc:
        if token in word2vec_model.vocab:
            # A single known token is enough to build a document vector.
            return True
    return False

def document_vector(word2vec_model, doc):
    """Average the model vectors of the tokens of ``doc`` that are in the vocabulary.

    Tokens missing from ``word2vec_model.vocab`` are ignored. The remaining
    tokens are looked up in one indexing call and averaged along axis 0.
    """
    in_vocab = []
    for token in doc:
        if token in word2vec_model.vocab:
            in_vocab.append(token)

    token_vectors = word2vec_model[in_vocab]
    return np.mean(token_vectors, axis=0)

def get_user_dataframe(path):
    """Load every ``.csv`` file found one level below ``path`` into one DataFrame.

    Each immediate sub-folder of ``path`` is treated as a category: its name is
    printed and stored in the ``category`` column of every row read from it.
    Files are read without a header row; the four positional columns are named
    ``tweet_id``, ``date``, ``user_name`` and ``text``, and ``usecols`` keeps
    positions 0, 2 and 3 (so ``date`` is dropped). All values are cast to ``str``.

    Parameters
    ----------
    path : str
        Root directory containing one sub-folder per category.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all files, with columns ``tweet_id``, ``user_name``,
        ``text`` and ``category``. The per-file indexes are kept, so index
        values repeat across files.

    Raises
    ------
    ValueError
        If no ``.csv`` file is found in any sub-folder of ``path``.
    """
    dataFrames = []
    for folder in os.listdir(path):
        folder_path = os.path.join(path, folder)
        if not os.path.isdir(folder_path):
            # Plain files sitting directly under the root are ignored.
            continue
        print("folder: ", folder)
        for file_name in os.listdir(folder_path):
            if not file_name.endswith(".csv"):
                continue
            file_path = os.path.join(folder_path, file_name)
            df = pd.read_csv(file_path, header=None, usecols=[3, 0, 2],
                             names=['tweet_id', 'date', 'user_name', 'text'])
            df = df.astype(str)
            df["category"] = folder
            dataFrames.append(df)

    if not dataFrames:
        # pd.concat([]) would raise a bare "No objects to concatenate"; say
        # which directory was empty instead (same exception type).
        raise ValueError("no .csv files found in any sub-folder of {}".format(path))

    dfs = pd.concat(dataFrames)
    print("total ", len(dfs), " tweets")
    return dfs

In [4]:
def _collect_user_vectors(model, dfs):
    """Embed each row's ``text`` and group the resulting vectors by ``user_name``.

    Rows whose tokens have no entry in the model vocabulary are skipped. Each
    user entry holds ``vecs`` (list of tweet vectors), ``category`` (taken from
    the first embedded row of that user) and ``avr_vec`` (mean of ``vecs``).
    """
    user_vecs = {}
    for _, datapoint in dfs.iterrows():
        tokenized_words = preprocess(datapoint["text"])
        if not has_vector_representation(model, tokenized_words):
            continue
        vec = document_vector(model, tokenized_words)
        unname = datapoint["user_name"]
        if unname in user_vecs:
            user_vecs[unname]["vecs"].append(vec)
        else:
            user_vecs[unname] = {"vecs": [vec], "category": datapoint["category"]}

    # find the average of tweet vectors for each user
    for entry in user_vecs.values():
        entry["avr_vec"] = np.average(np.array(entry["vecs"]), axis=0)
    return user_vecs


def _collect_category_vectors(user_vecs):
    """Group the users' average vectors by category and average them per category."""
    ctg_vecs = {}
    for entry in user_vecs.values():
        ctg = entry["category"]
        if ctg in ctg_vecs:
            ctg_vecs[ctg]["cat_vecs"].append(entry["avr_vec"])
        else:
            ctg_vecs[ctg] = {"cat_vecs": [entry["avr_vec"]]}

    # find the average of category vectors
    for entry in ctg_vecs.values():
        entry["avr_cat_vec"] = np.average(np.array(entry["cat_vecs"]), axis=0)
    return ctg_vecs


def calc_iter_acc(model, dfs_train, dfs_test):
    """Build user/category embeddings, evaluate them on the test users, and save them.

    Steps:
      1. Average tweet vectors per user for the train and the test frame.
      2. Average the train users' vectors per category.
      3. Hand the test-user vectors and the train-category vectors to
         ``user2vec_test`` and print/log its accuracy reports, the confusion
         matrix and the classification report.
      4. Write the three dictionaries to JSON via ``saveJson`` (epoch 0, under
         the current working directory).

    Parameters
    ----------
    model
        Word-vector model exposing ``.vocab`` and item lookup by a list of words.
    dfs_train, dfs_test : pandas.DataFrame
        Frames with at least ``user_name``, ``category`` and ``text`` columns.

    Returns
    -------
    str
        Always the literal ``"success"``.
    """
    logging.critical("beginning: of calc_iter_acc")
    logging.critical("first: inferring user_vecs_train")
    user_vecs_train = _collect_user_vectors(model, dfs_train)

    logging.critical("second: inferring user_vecs_test")
    user_vecs_test = _collect_user_vectors(model, dfs_test)

    # create category vector dictionary (from the training users only)
    ctg_vecs_train = _collect_category_vectors(user_vecs_train)

    # JSON-friendly copies: numpy arrays become plain lists, raw tweet vectors are dropped.
    users_test = {}
    for usr, entry in user_vecs_test.items():
        users_test[usr] = {'avr_vec': entry["avr_vec"].tolist(), 'category': entry["category"]}

    users_train = {}
    for usr, entry in user_vecs_train.items():
        users_train[usr] = {'avr_vec': entry["avr_vec"].tolist(), 'category': entry["category"]}

    categories_train = {}
    for ctg, entry in ctg_vecs_train.items():
        categories_train[ctg] = {'avr_cat_vec': entry["avr_cat_vec"].tolist()}

    # NOTE(review): this message is emitted again at the very end of the
    # function; it is kept here unchanged so existing log output stays the same.
    logging.critical("end of: calc_iter_acc")
    uvt = user2vec_test(user_vecs=users_test, ctg_vecs=categories_train)  # test model
    msg1 = uvt.calc_accuracy()
    print(msg1)
    logging.critical(msg1)
    msg2 = uvt.calc_accuracy_by_group()
    print(msg2)
    logging.critical(msg2)

    # conf matrix: true category vs. category reported by most_similar_group
    true_labels = []
    predicted_labels = []
    for unm in uvt.user_vecs.keys():
        true_labels.append(uvt.getUserCategory(unm))
        predicted_labels.append(uvt.most_similar_group(unm))

    # Print the confusion matrix
    msg3 = metrics.confusion_matrix(true_labels, predicted_labels)
    print(msg3)
    logging.critical(msg3)

    # Print the precision and recall, among other metrics
    msg4 = metrics.classification_report(true_labels, predicted_labels, digits=3)
    print(msg4)
    logging.critical(msg4)

    path_prefix = os.getcwd()
    epoch = 0
    saveJson(path_prefix, users_test, categories_train, users_train, epoch)
    logging.critical("end of: calc_iter_acc")
    return "success"

In [5]:
# Check paths
# Root folders of the train and test splits. get_user_dataframe treats every
# sub-folder of a root as one category.
path_train = "DATASTORE/DatasetA/train"
path_test = "DATASTORE/DatasetA/test"

# Read all CSV files under each root into one DataFrame per split.
dfs_train = get_user_dataframe(path_train)
dfs_test = get_user_dataframe(path_test)

folder:  twcollector3
folder:  twcollector5
folder:  twcollector1
folder:  twcollector2
folder:  twcollector4
total  300000  tweets
folder:  twcollector3
folder:  twcollector5
folder:  twcollector1
folder:  twcollector2
folder:  twcollector4
total  200000  tweets


In [6]:
# Run the experiment: embed users, evaluate on the test split, save the JSON
# files, and print the status string returned by calc_iter_acc.
print(calc_iter_acc(model, dfs_train, dfs_test))

Total Acc is 167/200 = 0.835

overall acc. how many users are closest to their group 
 twcollector3 38/40 Acc 0.95
twcollector5 39/40 Acc 0.975
twcollector1 24/40 Acc 0.6
twcollector2 28/40 Acc 0.7
twcollector4 38/40 Acc 0.95


[[24  7  1  2  6]
 [ 5 28  5  1  1]
 [ 0  0 38  1  1]
 [ 0  1  1 38  0]
 [ 0  0  0  1 39]]
              precision    recall  f1-score   support

twcollector1      0.828     0.600     0.696        40
twcollector2      0.778     0.700     0.737        40
twcollector3      0.844     0.950     0.894        40
twcollector4      0.884     0.950     0.916        40
twcollector5      0.830     0.975     0.897        40

    accuracy                          0.835       200
   macro avg      0.833     0.835     0.828       200
weighted avg      0.833     0.835     0.828       200

success
