In [50]:
## Packages needed for data pre-processing
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

from scipy import sparse
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

import itertools

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [62]:
# Load the sampled jobs tweets and discard the "Unnamed: 0" column carried by
# the CSV (presumably an index written by an earlier export -- confirm).
df = pd.read_csv(
    "f_jobs_tweets_sampled_three_month.csv",
    encoding='unicode_escape',
).drop(columns='Unnamed: 0')

In [63]:
# Preview the loaded tweets (time, text, screen name and date columns).
df

Unnamed: 0,time,text,sn,date
0,2009-08-01 10:25:36,Now Hiring: Storage Architect II http://bit.l...,ChicagoJobAds,2009-08-01
1,2009-08-01 22:57:06,"""The Steve Jobs method"" discussion on Hacker N...",hnshah,2009-08-01
2,2009-08-01 23:27:08,AZ Jobs | Taco Bell Restaurant General Manager...,ZuluJobsAZ,2009-08-01
3,2009-08-01 09:55:12,"TN Jobs | SLP Travel Job in Knoxville Area, TN...",ZuluJobsTN,2009-08-01
4,2009-08-01 05:58:39,NJ Jobs | New Jersey Travel or Perm job- OT at...,ZuluJobsNJ,2009-08-01
...,...,...,...,...
27895,2009-11-01 02:15:14,these guys have to wake up. make him work alre...,yankee32879,2009-11-01
27896,2009-11-01 03:04:26,Therapy Jobs at HCR! Physical Therapist / PT -...,lydsterj2w,2009-11-01
27897,2009-11-01 00:21:24,hospitality jobs http://bit.ly/3XvUT1,Ur_WebInfoNews,2009-11-01
27898,2009-11-01 03:26:41,Obama Tempers Economic News With Caution On Jo...,suzanne_newton,2009-11-01


In [64]:
# Ordered regex substitutions applied to every tweet. All patterns are raw
# strings so that sequences such as "\s" reach the regex engine untouched
# instead of being parsed as (invalid) Python string escapes.
_REGEX_CLEANUP_STEPS = [
    (r'@', ''),         # strip the "@" sign only; the handle/address text is kept
    (r'#', ''),         # strip the "#" sign only; the hashtag word is kept
    (r'\s+', ' '),      # collapse whitespace runs (including real newlines) to one space
    (r'http\S+', ''),   # remove URLs
    (r"\\n", " "),      # replace a literal backslash followed by "n" with a space
    (r"&amp", " "),     # remove the HTML-escaped ampersand
]


def clean_tweet(text):
    """Return ``text`` with symbols, URLs, digits and punctuation removed.

    The steps run in a fixed order: the regex substitutions listed in
    ``_REGEX_CLEANUP_STEPS``, then removal of every digit character, then
    replacement of punctuation runs (plus trailing spaces) by a single space.
    """
    for pattern, replacement in _REGEX_CLEANUP_STEPS:
        text = re.sub(pattern, replacement, text)

    # Remove numbers: drop every character for which str.isdigit() is true.
    text = ''.join(ch for ch in text if not ch.isdigit())

    # Remove punctuation: each run of these symbols, together with any spaces
    # right after it, becomes one space.
    return re.sub(r"[,.;@#?!&$]+\ *", ' ', text)


# Convert to lowercase and convert to list, then clean every tweet.
data = [clean_tweet(sent) for sent in df.text.str.lower().values.tolist()]

In [65]:
# Tokenise the text with gensim's simple_preprocess()
def sent_to_words(sentences):
    """Lazily yield one token list per entry of ``sentences``.

    Every entry is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)  # deacc=True strips accent marks
        for raw_sentence in sentences
    )

# Materialise the token list of every tweet, then print the first three
# documents followed by the total number of documents as a sanity check.
data_words = [tokens for tokens in sent_to_words(data)]

print(data_words[:3], len(data_words), sep='\n')

[['now', 'hiring', 'storage', 'architect', 'ii', 'jobs'], ['the', 'steve', 'jobs', 'method', 'discussion', 'on', 'hacker', 'news', 'via', 'ericries'], ['az', 'jobs', 'taco', 'bell', 'restaurant', 'general', 'manager', 'at', 'taco', 'bell', 'peoria', 'az', 'job', 'hiring', 'azjobs']]
27900


In [66]:
# Remove Stop Words: start from NLTK's English stop-word list.
# Kept as a plain list (the value returned by stopwords.words).
stop_words = stopwords.words('english')

# Define the stop-word filter
def remove_stopwords(texts):
    """Return one token list per document in ``texts`` with stop words removed.

    Each document is converted with ``str()`` and tokenised by
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    are dropped.

    NOTE(review): when a document is already a list of tokens, ``str()``
    yields its list repr, which is then re-tokenised. The notebook relies on
    that round-trip giving back the same tokens.
    """
    # Snapshot the stop-word list into a set once per call so each membership
    # test is a hash lookup instead of a scan over the whole list.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

# Apply the stop-word filter to every tokenised tweet.
data_words_unigrams = remove_stopwords(data_words)

# Stemming

In [67]:
# Re-join each document's tokens into one space-separated string so the
# stemmer can work on plain text.
data = [' '.join(tokens) for tokens in data_words_unigrams]

In [68]:
def stemming(tweet, stem=True):
    """Stem every whitespace-separated word of ``tweet`` with ``PorterStemmer``.

    Parameters
    ----------
    tweet : str
        Text whose words are separated by whitespace.
    stem : bool, default True
        When falsy, ``tweet`` is returned unchanged.

    Returns
    -------
    str
        The stemmed words joined by single spaces, or the original ``tweet``
        when stemming is disabled.
    """
    # Test truthiness instead of ``stem == True`` so any truthy flag enables
    # stemming, and bail out before building a stemmer that would go unused.
    if not stem:
        return tweet

    ps = PorterStemmer()
    return ' '.join(ps.stem(word) for word in tweet.split())

# Stem every re-joined tweet (stemming() is called with its default stem=True).
data_stemming = list(map(stemming, data))

In [69]:
# Split every stemmed tweet back into its list of tokens. Both names end up
# bound to the same list object.
data_stemming1 = [stemmed_tweet.split() for stemmed_tweet in data_stemming]
data_stemming = data_stemming1

In [70]:
# Minimum number of corpus-wide occurrences a term needs in order to be kept
MIN_TERM_COUNT = 20

# Count unique words
merged = list(itertools.chain.from_iterable(data_stemming))
print(len(merged))
print(len(set(merged)))

# Identify words that appear at least MIN_TERM_COUNT times (first-seen order)
c = Counter(merged)
a = [term for term, count in c.items() if count >= MIN_TERM_COUNT]

# Select words that appear at least MIN_TERM_COUNT times. A set makes every
# lookup a hash probe instead of a scan over the whole list of kept terms.
# Slice assignment rewrites the existing list object in place, so any other
# name bound to it sees the filtered documents as well.
frequent_terms = set(a)
data_stemming[:] = [
    [term for term in tokens if term in frequent_terms]
    for tokens in data_stemming
]

# Check
merged = list(itertools.chain.from_iterable(data_stemming))
print(len(set(merged)))

# Get the positions of the docs that lost all of their tokens
empty_idx = [pos for pos, tokens in enumerate(data_stemming) if not tokens]

# Delete empty elements
data_stemming2 = [tokens for tokens in data_stemming if tokens]
assert len(data_stemming2) + len(empty_idx) == len(data_stemming)
print(len(data_stemming2))

289174
21565
1829
27898


In [71]:
# Documents that feed both the dictionary and the corpus
texts = data_stemming2

# Create Dictionary from those documents
id2word = corpora.Dictionary(texts)

# Create Corpus: one doc2bow() result per document
corpus = list(map(id2word.doc2bow, texts))

In [72]:
# Dense matrix built from the bag-of-words corpus. The number of terms is read
# from the dictionary instead of being hard-coded (it was 1829 for one specific
# run), so the cell stays correct when the data or the frequency cut-off change.
a_s = gensim.matutils.corpus2dense(corpus, num_terms=len(id2word))

# Create Doc-word matrix: transpose and store as float64
b_s = a_s.T.astype(np.float64)
print(b_s.shape)
print(b_s)

# Optional export:
# np.savetxt('jobs_doc_word_matrix_stemmingf.csv', b_s, delimiter=',')

(27898, 1829)
[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [73]:
# Extract Document index: labels of the df rows that are not listed in
# empty_idx. A set keeps each membership test O(1).
dropped_positions = set(empty_idx)
selected_idex = [x for x in list(df.index) if x not in dropped_positions]

# Obtain remaining terms, looked up by token id 0 .. len(id2word) - 1 so the
# column labels line up with the columns of b_s. No temporary globals are
# left behind or overwritten by this lookup.
words = [id2word[token_id] for token_id in range(len(id2word))]

# Create a dataframe: one row per kept document, one column per term
b_ss = pd.DataFrame(b_s, columns=words, index=selected_idex)

In [74]:
# Preview the document-term count frame.
b_ss

Unnamed: 0,architect,hire,ii,job,discuss,news,steve,via,az,azjob,...,advis,hazmat,caution,pogu,airway,effort,oct,iwow,persist,overst
0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27895,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27896,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27897,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
27898,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
#b_ss.to_csv("f_jobs_doc_word_matrix_stemmingf.csv")

In [78]:
# Metadata rows of the documents that survived filtering. Taking an explicit
# copy makes doc_metadata independent of df, so adding the column below cannot
# raise SettingWithCopyWarning and never touches df.
doc_metadata = df.loc[selected_idex, :].copy()

# Attach each document's final token list
doc_metadata["tokenize"] = data_stemming2
doc_metadata

Unnamed: 0,time,text,sn,date,tokenize
0,2009-08-01 10:25:36,Now Hiring: Storage Architect II http://bit.l...,ChicagoJobAds,2009-08-01,"[hire, architect, ii, job]"
1,2009-08-01 22:57:06,"""The Steve Jobs method"" discussion on Hacker N...",hnshah,2009-08-01,"[steve, job, discuss, news, via]"
2,2009-08-01 23:27:08,AZ Jobs | Taco Bell Restaurant General Manager...,ZuluJobsAZ,2009-08-01,"[az, job, taco, bell, restaur, gener, manag, t..."
3,2009-08-01 09:55:12,"TN Jobs | SLP Travel Job in Knoxville Area, TN...",ZuluJobsTN,2009-08-01,"[tn, job, slp, travel, job, knoxvil, area, tn,..."
4,2009-08-01 05:58:39,NJ Jobs | New Jersey Travel or Perm job- OT at...,ZuluJobsNJ,2009-08-01,"[nj, job, new, jersey, travel, perm, job, ot, ..."
...,...,...,...,...,...
27895,2009-11-01 02:15:14,these guys have to wake up. make him work alre...,yankee32879,2009-11-01,"[guy, make, work, alreadi, job]"
27896,2009-11-01 03:04:26,Therapy Jobs at HCR! Physical Therapist / PT -...,lydsterj2w,2009-11-01,"[therapi, job, hcr, physic, therapist, pt, prn..."
27897,2009-11-01 00:21:24,hospitality jobs http://bit.ly/3XvUT1,Ur_WebInfoNews,2009-11-01,"[hospit, job]"
27898,2009-11-01 03:26:41,Obama Tempers Economic News With Caution On Jo...,suzanne_newton,2009-11-01,"[obama, temper, econom, news, caution, job, mo..."


In [79]:
#doc_metadata.to_csv("f_jobs_stemming_meta_doc.csv")