In [545]:
import pandas as pd 
import numpy as np
import glob
import nltk
#nltk.download('wordnet')
#nltk.download('punkt')
from textblob import Word
from collections import Counter
from nltk.tokenize import word_tokenize
import re
from sklearn.metrics.pairwise import cosine_similarity

In [546]:
# read in stopwords and make it a list of words
# data_dir is assigned here because this cell runs before the cell that lists
# the tweet files; without it a fresh kernel fails with a NameError.
data_dir = '/data/'

# only the first line of the file is used; its words are separated by commas
with open(data_dir + "stopwords.txt", 'r') as file:
    first_line = file.readline()

# strip() removes the trailing newline and any padding around the commas, so
# the last word on the line can still match a token; empty entries are dropped
stopwords = [word.strip() for word in first_line.split(',') if word.strip()]

In [547]:
# locate the tweet files: every "U*.txt" file directly under data_dir
data_dir = '/data/'
tweet_file_pattern = data_dir + "U*.txt"
file_list = glob.glob(tweet_file_pattern)

In [548]:
# create a list of df with structure tweet | user_id, and do text cleaning
df_list = []

# set lookup is O(1) per token; built once instead of scanning the list for every word
stopword_set = set(stopwords)

for f in file_list:

    # header=None: the first line of each file is data, not column names
    df = pd.read_csv(f, header=None, sep='\t')

    # extract the user id from the file name (".../U11.txt" -> "11") and store it as a new column
    df['user_id'] = f.split("/")[-1].split(".")[0].replace("U", "")

    # rename columns (this requires the file to have parsed into exactly one text column)
    df.columns = ['tweet', 'user_id']

    ## text cleaning on the tweet column
    # to lower; the split/join also collapses runs of whitespace to single spaces
    df['tweet'] = df['tweet'].apply(lambda text: " ".join(token.lower() for token in text.split()))

    # Remove Punctuation
    # regex=True is spelled out because pandas 2.0 switched the default to literal
    # matching, which would silently leave all punctuation in place
    df['tweet'] = df['tweet'].str.replace(r'[^\w\s]', '', regex=True)

    # Remove Numbers
    df['tweet'] = df['tweet'].str.replace(r'[0-9]', '', regex=True)

    # Remove Stopwords
    df['tweet'] = df['tweet'].apply(
        lambda text: " ".join(token for token in text.split() if token not in stopword_set)
    )

    # Lemmatization
    df['tweet'] = df['tweet'].apply(
        lambda text: " ".join([Word(token).lemmatize() for token in text.split()])
    )

    df_list.append(df)

## Result 0: Dictionary size of each Twitter user

In [600]:
# create a list of per-user dictionaries, each holding that user's 10 most frequent words
dictionary_list = []

for df in df_list:

    # the whole user_id column was filled from one file name, so the first row identifies the user
    user_id = df['user_id'].iloc[0]

    # combine all rows in the df as one corpus (joined once, not once per row)
    corpus = " ".join(df['tweet'])

    # replace any character repeated back-to-back with a single occurrence ("happy" -> "hapy")
    corpus = re.sub(r"(.)\1+", r"\1", corpus)

    # create the dictionary: one row per distinct word with its frequency
    word_counts = Counter(corpus.split())
    dictionary = pd.DataFrame(list(word_counts.items()), columns=['V', 'freq'])

    print("U" + user_id + "'s dictionary size is: {}.".format(dictionary.shape[0]))

    # select top 10 from dictionary; mergesort is stable, so words tied on freq
    # keep their order of first appearance and the cut-off is reproducible
    top_words = dictionary.sort_values('freq', ascending=False, kind='mergesort').head(10)

    # add the word length and the user id
    top_words = top_words.assign(S=top_words['V'].str.len(), user_id=user_id)

    # sort by word length (longest first) and then alphabetically, in one
    # multi-key sort so the alphabetical tie-break is guaranteed
    top_words = top_words.sort_values(['S', 'V'], ascending=[False, True])

    # reset index so that it becomes the rank 0..9 within this user
    top_words = top_words.reset_index(drop=True)

    # reorder columns
    top_words = top_words[['V', 'S', 'freq', 'user_id']]

    # append this user's dictionary to dictionary_list
    dictionary_list.append(top_words)

U11's dictionary size is: 1023.
U10's dictionary size is: 1127.
U4's dictionary size is: 2965.
U5's dictionary size is: 581.
U7's dictionary size is: 239.
U6's dictionary size is: 64.
U2's dictionary size is: 2583.
U3's dictionary size is: 1715.
U1's dictionary size is: 1487.
U8's dictionary size is: 8917.
U9's dictionary size is: 609.


In [551]:
# one single-column frame per user: the word-length column 'S' of that user's
# dictionary, relabelled with the user's id
vector_list = [
    top_words[['S']].rename(columns={'S': top_words['user_id'][0]})
    for top_words in dictionary_list
]

In [552]:
# concat vector_list.
# the resulting 'vectors' table has one column per user (column name = user_id) and one row per rank
vectors = pd.concat(vector_list, axis=1)

# order the columns by numeric user id (1, 2, 3, ..., 11) rather than as text ('1', '10', '11', '2', ...)
# a real list is built here: a bare map() iterator is single-use and is not a reliable column selector
columns_in_order = sorted(vectors.columns, key=int)
vectors = vectors[columns_in_order]

# add 'U' to column name; user_in_order is reused further down to label the similarity matrix
user_in_order = ['U' + user for user in columns_in_order]
print(user_in_order)
# assign the flat list: wrapping it in another list would create a one-level MultiIndex
vectors.columns = user_in_order

# transpose, therefore change 'vectors' table to: index = user_id, column names = ranking
vectors = vectors.transpose()
vectors

['U1', 'U2', 'U3', 'U4', 'U5', 'U6', 'U7', 'U8', 'U9', 'U10', 'U11']


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
U1,14,8,6,5,5,4,4,3,3,2
U2,8,7,7,7,6,6,5,5,2,1
U3,13,7,7,4,3,3,3,2,2,1
U4,11,8,6,5,5,4,4,3,1,1
U5,13,7,5,4,4,4,2,2,2,2
U6,12,10,7,4,4,4,4,3,3,2
U7,12,5,5,5,5,4,3,3,2,2
U8,10,8,6,5,5,4,4,3,3,3
U9,12,12,6,4,4,4,4,4,3,2
U10,11,6,4,4,4,4,2,2,2,2


In [553]:
# pairwise cosine similarity between the rows of 'vectors', wrapped in a DataFrame
similarities = pd.DataFrame(cosine_similarity(vectors))
similarities

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,0.930117,0.98821,0.987582,0.99307,0.986238,0.990448,0.984163,0.972059,0.993842,0.980076
1,0.930117,1.0,0.901442,0.960774,0.900265,0.933187,0.935948,0.962327,0.921615,0.91863,0.889727
2,0.98821,0.901442,1.0,0.976334,0.987402,0.980707,0.97335,0.961905,0.959632,0.978324,0.977238
3,0.987582,0.960774,0.976334,1.0,0.975907,0.985586,0.977732,0.985586,0.975533,0.978751,0.974674
4,0.99307,0.900265,0.987402,0.975907,1.0,0.976237,0.985441,0.967538,0.961438,0.997261,0.983115
5,0.986238,0.933187,0.980707,0.985586,0.976237,1.0,0.959808,0.987684,0.993595,0.974292,0.988086
6,0.990448,0.935948,0.97335,0.977732,0.985441,0.959808,1.0,0.972155,0.938197,0.990975,0.951025
7,0.984163,0.962327,0.961905,0.985586,0.967538,0.987684,0.972155,1.0,0.977822,0.975552,0.964333
8,0.972059,0.921615,0.959632,0.975533,0.961438,0.993595,0.938197,0.977822,1.0,0.960649,0.988978
9,0.993842,0.91863,0.978324,0.978751,0.997261,0.974292,0.990975,0.975552,0.960649,1.0,0.976459


In [554]:
# blank out each user's similarity with itself so it cannot win the max lookup below
# only the diagonal is masked: testing values against 1 would also hide two
# different users whose vectors are identical, i.e. the most similar pair
arr = similarities.values.astype(float)  # astype returns a copy, the input frame is untouched
np.fill_diagonal(arr, np.nan)

# label rows and columns with the user ids (flat labels, not a nested list)
similarities = pd.DataFrame(arr, index=user_in_order, columns=user_in_order)
similarities

Unnamed: 0,U1,U2,U3,U4,U5,U6,U7,U8,U9,U10,U11
U1,,0.930117,0.98821,0.987582,0.99307,0.986238,0.990448,0.984163,0.972059,0.993842,0.980076
U2,0.930117,,0.901442,0.960774,0.900265,0.933187,0.935948,0.962327,0.921615,0.91863,0.889727
U3,0.98821,0.901442,,0.976334,0.987402,0.980707,0.97335,0.961905,0.959632,0.978324,0.977238
U4,0.987582,0.960774,0.976334,,0.975907,0.985586,0.977732,0.985586,0.975533,0.978751,0.974674
U5,0.99307,0.900265,0.987402,0.975907,,0.976237,0.985441,0.967538,0.961438,0.997261,0.983115
U6,0.986238,0.933187,0.980707,0.985586,0.976237,,0.959808,0.987684,0.993595,0.974292,0.988086
U7,0.990448,0.935948,0.97335,0.977732,0.985441,0.959808,,0.972155,0.938197,0.990975,0.951025
U8,0.984163,0.962327,0.961905,0.985586,0.967538,0.987684,0.972155,,0.977822,0.975552,0.964333
U9,0.972059,0.921615,0.959632,0.975533,0.961438,0.993595,0.938197,0.977822,,0.960649,0.988978
U10,0.993842,0.91863,0.978324,0.978751,0.997261,0.974292,0.990975,0.975552,0.960649,,0.976459


## Result 1: The most similar users were U5 and U10

In [557]:
# keep only the cell(s) holding the largest similarity; everything else shows as NaN
highest_similarity = similarities.max().max()
similarities.where(similarities == highest_similarity)

Unnamed: 0,U1,U2,U3,U4,U5,U6,U7,U8,U9,U10,U11
U1,,,,,,,,,,,
U2,,,,,,,,,,,
U3,,,,,,,,,,,
U4,,,,,,,,,,,
U5,,,,,,,,,,0.997261,
U6,,,,,,,,,,,
U7,,,,,,,,,,,
U8,,,,,,,,,,,
U9,,,,,,,,,,,
U10,,,,,0.997261,,,,,,


## Result 2: The most dissimilar users were U2 and U11

In [556]:
# keep only the cell(s) holding the smallest similarity; everything else shows as NaN
lowest_similarity = similarities.min().min()
similarities.where(similarities == lowest_similarity)

Unnamed: 0,U1,U2,U3,U4,U5,U6,U7,U8,U9,U10,U11
U1,,,,,,,,,,,
U2,,,,,,,,,,,0.889727
U3,,,,,,,,,,,
U4,,,,,,,,,,,
U5,,,,,,,,,,,
U6,,,,,,,,,,,
U7,,,,,,,,,,,
U8,,,,,,,,,,,
U9,,,,,,,,,,,
U10,,,,,,,,,,,


## Result 3: Save the dictionaries and the similarity results 

In [601]:
# stack the per-user dictionaries into one table
# each dictionary's own index becomes the 'rank_in_user' column, user_id is
# converted to a number, and rows are sorted by user and then by rank
dictionary_all = (
    pd.concat(dictionary_list)
    .rename_axis('rank_in_user')
    .reset_index()
    .assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce'))
    .sort_values(['user_id', 'rank_in_user'])
    .loc[:, ['user_id', 'rank_in_user', 'V', 'S', 'freq']]
)

print(dictionary_all)

# save the combined dictionary table
dictionary_all.to_csv('output1.txt', index=False)

# save the similarity matrix (written without its row labels)
similarities.to_csv('output2.txt', index=False)

     user_id  rank_in_user                V   S  freq
80         1             0   camilynsoledad  14   167
81         1             1         birthday   8   162
82         1             2           thanks   6    55
83         1             3            thank   5    54
84         1             4            xfxfx   5    65
85         1             5             hapy   4   170
86         1             6             hope   4    62
87         1             7              day   3    79
88         1             8              god   3    55
89         1             9               rt   2   308
60         2             0         unlocked   8   840
61         2             1          android   7   307
62         2             2          prepaid   7   273
63         2             3          samsung   7   398
64         2             4           camera   6   238
65         2             5           mobile   6   176
66         2             6            black   5   195
67         2             7  