In [1]:
## Packages need for data pre-process
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from scipy import sparse
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist

# word normalization (remove duplicate letters (e.g. llllooooovvvveee -> love))
nltk.download('wordnet')
import re
from nltk.corpus import wordnet
from repeatedReplacer import RepeatReplacer 
replacer = RepeatReplacer()

import itertools
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# read in functions from 'preprocessingFunctions.py'
import preprocessingFunctions 

[nltk_data] Downloading package wordnet to /home/rep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /home/rep/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### - Tweets from 4 distinct users

In [19]:
# Import Dataset
import os 
os.chdir("/home/rep/scRNA-seq_clustering_to_Twitter/P1_preprocessing")
os.getcwd()
four = pd.read_csv('four_users.csv')
del four['Unnamed: 0']
print(four.shape)
four.head

(12780, 4)


<bound method NDFrame.head of         user_id  user_id_new      screen_name  \
0      27902825            2    UMichFootball   
1      27902825            2    UMichFootball   
2      27902825            2    UMichFootball   
3      27902825            2    UMichFootball   
4      27902825            2    UMichFootball   
...         ...          ...              ...   
12775  19071682            3  breakingweather   
12776  19071682            3  breakingweather   
12777  19071682            3  breakingweather   
12778  19071682            3  breakingweather   
12779  19071682            3  breakingweather   

                                                    text  
0                              👇 https://t.co/swtsZWWaJe  
1      Leave it all on the field! @UMichFootball! Bes...  
2      There’s no time to look backwards… only ahead!...  
3         2️⃣4️⃣:0️⃣0️⃣:0️⃣0️⃣ ⏳ https://t.co/eM3yUXJXaq  
4      It’s called “The Game’ for a reason. \r\n\r\n#...  
...                        

In [3]:
# Convert to lowercase and convert to list
data = four.text.str.lower().values.tolist()
data = [preprocessingFunctions.preProcessingFcn(tweet) for tweet in data]

In [4]:
# tokenize the tweets and remove punctuations
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [5]:
# Remove Stop Words
stop_words = stopwords.words('english')
data_words_unigrams = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_words]

In [6]:
# Stemming
data = []
for i in data_words_unigrams:
    tweet = ' '.join(i)
    data.append(tweet)

data_stemming = [preprocessingFunctions.stemming(tweet) for tweet in data]

data_stemming_temp = []
for i in data_stemming:
    alist = i.split()
    data_stemming_temp.append(alist)
    
data_stemming = data_stemming_temp

In [7]:
# Remove 80% of the least frequent words
words_dict, data_stemming1, empty_idx = preprocessingFunctions.trim_noise(data_stemming, 80)

Proportion of remaining tweets w.r.t. original tweets: 97.86%
Proportion of removed tweets w.r.t. original tweets: 2.14%


In [8]:
# The lowest word frequency in the remaining tweets 
min(words_dict.values())

10

In [9]:
#######################################
##### Create document-term matrix #####
#######################################

# Create Dictionary
id2word = corpora.Dictionary(data_stemming1)

# Create Corpus
texts = data_stemming1

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

a_s = gensim.matutils.corpus2dense(corpus, num_terms = len(words_dict))
b_s = a_s.T.astype(np.float64)

# Extract Document index
selected_idex = [x for x in list(four.index) if x not in empty_idx]

# Obtain remaining terms
words = [] 
for i,j in enumerate(id2word):
    words.append(id2word[i])

# Create a dataframe for the document-term matrix
b_ss = pd.DataFrame(b_s, columns=words, index=selected_idex)
print(b_ss.shape)
print(b_ss)

(12507, 2043)
       beatosu  best  colleg  field  footbal  goblu  leav  rivalri  \
1          1.0   1.0     1.0    1.0      1.0    1.0   1.0      1.0   
2          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
4          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
5          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
7          1.0   0.0     0.0    0.0      0.0    1.0   0.0      0.0   
...        ...   ...     ...    ...      ...    ...   ...      ...   
12775      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12776      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12777      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12778      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   
12779      0.0   0.0     0.0    0.0      0.0    0.0   0.0      0.0   

       umichfootbal  ahead  ...  antil  lesser  uptick  dorian  \
1               1.0    0.0  ...    0.0     0.0     0.0     0.0   
2            

In [10]:
# obtain attributes for the remaining tweet 
four_after = four.drop(empty_idx, axis=0)

tweets_processed = []
for i in data_stemming1:
    tweet = ' '.join(i)
    tweets_processed.append(tweet)

four_after['tweets_processed'] = list(tweets_processed)

In [None]:
# b_ss.to_csv("doc_word_matrix_stemming_four_users.csv")
# four_after.to_csv("doc_metadata_stemming_four_users.csv")

### - "jobs" Tweets

In [11]:
# Import Dataset
jobs = pd.read_csv("jobs_tweets_sampled_three_month.csv", encoding= 'unicode_escape')
del jobs['Unnamed: 0']

print(jobs.shape)
jobs.head

(27900, 4)


<bound method NDFrame.head of                       time                                               text  \
0      2009-08-01 10:25:36  Now Hiring:  Storage Architect II http://bit.l...   
1      2009-08-01 22:57:06  "The Steve Jobs method" discussion on Hacker N...   
2      2009-08-01 23:27:08  AZ Jobs | Taco Bell Restaurant General Manager...   
3      2009-08-01 09:55:12  TN Jobs | SLP Travel Job in Knoxville Area, TN...   
4      2009-08-01 05:58:39  NJ Jobs | New Jersey Travel or Perm job- OT at...   
...                    ...                                                ...   
27895  2009-11-01 02:15:14  these guys have to wake up. make him work alre...   
27896  2009-11-01 03:04:26  Therapy Jobs at HCR! Physical Therapist / PT -...   
27897  2009-11-01 00:21:24              hospitality jobs http://bit.ly/3XvUT1   
27898  2009-11-01 03:26:41  Obama Tempers Economic News With Caution On Jo...   
27899  2009-11-01 03:21:23  EXCITING, getting ready for my 1st job test =D...  

In [12]:
# Convert to lowercase and convert to list
data = jobs.text.str.lower().values.tolist()
data = [preprocessingFunctions.preProcessingFcn(tweet) for tweet in data]

In [13]:
# tokenize the tweets and remove punctuations
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))

In [14]:
# Remove Stop Words
stop_words = stopwords.words('english')
data_words_unigrams = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in data_words]

In [15]:
# Stemming
data = []
for i in data_words_unigrams:
    tweet = ' '.join(i)
    data.append(tweet)

data_stemming = [preprocessingFunctions.stemming(tweet) for tweet in data]

data_stemming_temp = []
for i in data_stemming:
    alist = i.split()
    data_stemming_temp.append(alist)
    
data_stemming = data_stemming_temp

In [16]:
# Remove 90% of the least frequent words
words_dict, data_stemming1, empty_idx1 = preprocessingFunctions.trim_noise(data_stemming, 90)

Proportion of remaining tweets w.r.t. original tweets: 99.99%
Proportion of removed tweets w.r.t. original tweets: 0.01%


In [17]:
# The lowest word frequency in the remaining tweets 
min(words_dict.values())

15

In [18]:
# print the removed tweets 
for i in empty_idx1:
    print(jobs.iloc[[i]].text)

282    http://bit.ly/rXYm5 :: e_jobs: &#10148;Concurs...
Name: text, dtype: object
13865    legitimate_telecommute_jobs  http://bit.ly/16tkOq
Name: text, dtype: object


In [19]:
#######################################
##### Create document-term matrix #####
#######################################

# Create Dictionary
id2word = corpora.Dictionary(data_stemming1)

# Create Corpus
texts = data_stemming1

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

a_s = gensim.matutils.corpus2dense(corpus, num_terms = len(words_dict))
b_s = a_s.T.astype(np.float64)

# Extract Document index
selected_idex = [x for x in list(jobs.index) if x not in empty_idx1]

# Obtain remaining terms
words = [] 
for i,j in enumerate(id2word):
    words.append(id2word[i])

# Create a dataframe
b_ss = pd.DataFrame(b_s, columns=words, index=selected_idex)
print(b_ss.shape)
print(b_ss)

(27898, 2146)
       architect  hire   ii  job  discuss  news  steve  via   az  azjob  ...  \
0            1.0   1.0  1.0  1.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
1            0.0   0.0  0.0  1.0      1.0   1.0    1.0  1.0  0.0    0.0  ...   
2            0.0   1.0  0.0  2.0      0.0   0.0    0.0  0.0  2.0    1.0  ...   
3            0.0   1.0  0.0  3.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
4            0.0   1.0  0.0  3.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
...          ...   ...  ...  ...      ...   ...    ...  ...  ...    ...  ...   
27895        0.0   0.0  0.0  1.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
27896        0.0   0.0  0.0  2.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
27897        0.0   0.0  0.0  1.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   
27898        0.0   0.0  0.0  1.0      0.0   2.0    0.0  0.0  0.0    0.0  ...   
27899        0.0   0.0  0.0  2.0      0.0   0.0    0.0  0.0  0.0    0.0  ...   

       airway  australian

In [20]:
# Remove the words that appear in >= 80% of the tweets
word_dict, data_stemming2, b_ss_f, empty_idx2 = preprocessingFunctions.trim_common(b_ss, 80, data_stemming1)
print(b_ss_f.shape)

Proportion of remaining tweets w.r.t. original tweets: 99.45%
Proportion of removed tweets w.r.t. original tweets: 0.55%
(27744, 2145)


In [21]:
# Obtain the idex of all empty tweets after pre-processing
empty_idx = empty_idx1 + empty_idx2

In [22]:
# obtain attributes for the remaining tweet 
jobs_after = jobs.drop(empty_idx, axis=0)

tweets_processed = []
for i in data_stemming2:
    tweet = ' '.join(i)
    tweets_processed.append(tweet)

jobs_after['tweets_processed'] = list(tweets_processed)

In [None]:
# b_ss_f.to_csv("doc_word_matrix_stemming_jobs.csv")
# jobs_after.to_csv("doc_metadata_stemming_jobs.csv")

# Ferg's distance matrix
##### Reference: https://deepblue.lib.umich.edu/handle/2027.42/163193
##### Ferg, Robyn. Modern Survey Estimation with Social Media and Auxiliary Data. Diss. 2020.

In [23]:
## packages
import os 
import csv
import numpy as np
from pyclustering.cluster.kmedoids import kmedoids
import pandas as pd
import statsmodels.api as sm # table
from sklearn.decomposition import LatentDirichletAllocation as LDA
import random 


## first, read in functions from 'distanceMatrices.py'
import distanceMatrices

#### (1) tweets from four distinct users

In [None]:
minMentions = 0
wordDistMethod = 'condProb'
newMethod1 = distanceMatrices.makeMatrices(list(four_after.tweets_processed), minMentions=minMentions, preProcess=False, wordDistMethod=wordDistMethod)

In [None]:
# tweet_distance_matrix = newMethod1['s']
# tweet_distance_matrix = np.asmatrix(tweet_distance_matrix)
# tweet_distance_matrix = pd.DataFrame(tweet_distance_matrix)
# tweet_distance_matrix.to_csv("tweet_distance_matrix_four_users.csv")

In [None]:
# word_distance_matrix = newMethod1['d']
# word_distance_matrix = np.asmatrix(word_distance_matrix)
# word_distance_matrix = pd.DataFrame(word_distance_matrix)
# word_distance_matrix.to_csv("word_distance_matrix_four_users.csv")

#### (2) "jobs" tweets

In [114]:
minMentions = 0
wordDistMethod = 'condProb'
newMethod = distanceMatrices.makeMatrices(list(jobs_after.tweets_processed), minMentions=minMentions, preProcess=False, wordDistMethod=wordDistMethod)

Creating word matrix
Making co-occurrence matrix
Making word transition matrix
Making word distance matrix
Making tweet distance matrix


In [115]:
# tweet_distance_matrix = newMethod['s']
# tweet_distance_matrix = np.asmatrix(tweet_distance_matrix)
# tweet_distance_matrix = pd.DataFrame(tweet_distance_matrix)
# tweet_distance_matrix.to_csv("tweet_distance_matrix_jobs.csv")

In [116]:
# word_distance_matrix = newMethod['d']
# word_distance_matrix = np.asmatrix(word_distance_matrix)
# word_distance_matrix = pd.DataFrame(word_distance_matrix)
# word_distance_matrix.to_csv("word_distance_matrix_jobs.csv")