## Data - Prep
Follow the instructions below to prepare the files for data extraction. (Alternatively, you can modify the code to upload the zip files and change the references.)

1) Create the following folder in your google drive final-project/raw 

2) Upload the 4 zipped files included in the submission to the folder above.

3) Run the code in cell 1 to mount the drive.

In [0]:
from google.colab import drive

# Attach Google Drive inside the Colab VM; every path used later in this
# notebook starts with this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [0]:
import zipfile

# (archive, destination folder) pairs: each quarterly notes archive is
# unpacked into its own sub-folder of raw/extracted.
archives = (
    ('/content/drive/My Drive/final-project/raw/2018q3_notes.zip',
     '/content/drive/My Drive/final-project/raw/extracted/2018q3'),
    ('/content/drive/My Drive/final-project/raw/2018q4_notes.zip',
     '/content/drive/My Drive/final-project/raw/extracted/2018q4'),
    ('/content/drive/My Drive/final-project/raw/2019q1_notes.zip',
     '/content/drive/My Drive/final-project/raw/extracted/2019q1'),
    ('/content/drive/My Drive/final-project/raw/2019q2_notes.zip',
     '/content/drive/My Drive/final-project/raw/extracted/2019q2'),
)
for archive_path, target_dir in archives:
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(target_dir)

In [0]:
from itertools import islice

# Peek at the first 10 lines of one extracted file to check its layout.
# islice simply stops at end-of-file, whereas calling next() ten times raises
# StopIteration when the file holds fewer than 10 lines.
with open("/content/drive/My Drive/final-project/raw/extracted/2018q3/txt.tsv") as myfile:
    head = list(islice(myfile, 10))
print(head)

In [0]:
import pandas as pd

# Only the two 2018 quarters are loaded here; append '2019q1/' and '2019q2/'
# to `folders` to include the other extracted archives.
folders = ['2018q3/', '2018q4/']
extract_folder = '/content/drive/My Drive/final-project/raw/extracted/'
context_file = 'txt.tsv'

# One txt.tsv path per quarter folder.
filenames = [extract_folder + quarter_dir + context_file for quarter_dir in folders]
print(filenames)

# Read each tab-separated file, then stack the rows under a fresh 0..n-1 index.
quarterly_frames = []
for tsv_path in filenames:
    quarterly_frames.append(pd.read_csv(tsv_path, sep='\t'))
dfs = pd.concat(quarterly_frames, ignore_index=True)

print(dfs.columns.values)

In [0]:
def _count_space_separated(value):
    """Return how many pieces `value` yields when split on single spaces.

    The value is passed through str() first, so non-string cells are counted
    by their text form; consecutive spaces produce empty pieces that are
    counted too.
    """
    return len(str(value).split(" "))


# Word count for each text in the 'value' column
dfs['word_count'] = dfs['value'].apply(_count_space_separated)
dfs[['value', 'word_count']].head()

In [0]:
# Summary statistics (count, mean, std, quartiles, min/max) of the word counts
dfs['word_count'].describe()

In [0]:
#Identify common words
import pandas

# Join the texts with a space between rows: joining with '' glues the last
# word of one row onto the first word of the next and corrupts the counts.
all_words = ' '.join(map(str, dfs['value'])).split()
# value_counts() sorts by descending frequency, so the first 20 entries are
# the 20 most common words.
freq = pandas.Series(all_words).value_counts()[:20]
freq

In [0]:
#Identify uncommon words
# value_counts() sorts by descending frequency, so the 20 rarest words are the
# LAST 20 entries ([-20:]); [:-20] would return everything except them.
# Rows are joined with a space so words from adjacent rows are not fused.
freq1 = pandas.Series(' '.join(map(str, dfs['value'])).split()).value_counts()[-20:]
freq1

# Pre Processing the text.
Now that we have the basic stats, let's do some pre-processing to remove noise and normalize the data. Data components that are redundant to the core text analytics can be considered noise.

In [0]:
# NLP tooling used by the pre-processing cells below.
import nltk
import re
# Fetch the WordNet data (presumably required by WordNetLemmatizer, which the
# corpus-building cell uses).
nltk.download('wordnet')
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
# Fetch the stop-word lists read via stopwords.words("english").
nltk.download('stopwords')
from nltk.corpus import stopwords
# NOTE(review): PorterStemmer is imported a second time here; harmless duplicate.
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer 


In [0]:
# Start from NLTK's English stop words.
stop_words = set(stopwords.words("english"))
# Custom stop words: month names and reporting-period boilerplate.
# They must be lower-case because the text is lower-cased before the
# stop-word filter runs, so capitalised entries would never match.
new_words = ['jan', 'january', 'feb', 'february', 'march', 'april', 'may', 'jun', 'june', 'july',
             'aug', 'august', 'sept', 'september', 'oct', 'october', 'nov', 'november', 'dec', 'december',
             'month', 'ended', 'ending', 'three', 'period']
stop_words = stop_words.union(new_words)

In [0]:
corpus = []
# Build the lemmatizer once instead of once per document.
lem = WordNetLemmatizer()
# Only the first 5000 rows are processed. head() also copes with a frame that
# has fewer rows, where indexing rows 0..4999 one by one would raise KeyError.
max_docs = 5000
for raw_value in dfs['value'].head(max_docs):
    # Replace every character that is not an ASCII letter with a space
    text = re.sub('[^a-zA-Z]', ' ', str(raw_value))

    # Convert to lowercase
    text = text.lower()

    # remove tags
    # NOTE(review): only letters and spaces remain at this point, so this
    # pattern (which needs '&' and ';') cannot match; kept unchanged.
    text = re.sub("&lt;/?.*?&gt;", " &lt;&gt; ", text)

    # remove special characters and digits (here this collapses runs of spaces)
    text = re.sub("(\\d|\\W)+", " ", text)

    # Convert to list from string
    words = text.split()

    # Lemmatisation, skipping stop words
    words = [lem.lemmatize(word) for word in words if word not in stop_words]
    # Drop short tokens (3 characters or fewer)
    words = [word for word in words if len(word) > 3]
    # Documents with no remaining words are left out of the corpus
    if words:
        corpus.append(" ".join(words))

In [0]:
# Preview the first 100 cleaned documents (each entry is one space-joined string)
corpus[:100]

#Data Exploration
We will now visualize the text corpus that we created after pre-processing to get insights on the most frequently used words.

In [0]:
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
% matplotlib inline
wordcloud = WordCloud(
                          background_color='white',
                          stopwords=stop_words,
                          max_words=100,
                          max_font_size=50, 
                          random_state=42
                         ).generate(str(corpus))
print(wordcloud)
fig = plt.figure(1)
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
fig.savefig("word1.png", dpi=900)

# Text preparation

Text in the corpus needs to be converted to a format that can be interpreted by the machine learning algorithms. There are 2 parts of this conversion — Tokenisation and Vectorisation.

For text preparation we use the bag of words model which ignores the sequence of the words and only considers word frequencies.

# Creating a vector of word counts

As the first step of conversion, we will use the CountVectorizer to tokenise the text and build a vocabulary of known words. We first create a variable “cv” of the CountVectorizer class, and then invoke the fit_transform function to learn and build the vocabulary.

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
import re
# CountVectorizer documents stop_words as 'english', a list, or None, so the
# stop-word set is converted to a list (sorted to keep the order stable).
cv = CountVectorizer(max_df=0.8, stop_words=sorted(stop_words), max_features=10000, ngram_range=(1, 3))
# Learn the vocabulary and build the document-term count matrix in one step.
X = cv.fit_transform(corpus)

#Parameters passed to the Vectorizer function
cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))

max_df — When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). This is to ensure that we only have words relevant to the context and not commonly used words.

max_features — determines the number of columns in the matrix.

n-gram range — we would want to look at a list of single words, two words (bi-grams) and three words (tri-gram) combinations.

In [0]:
# First 50 terms of the fitted vocabulary (iterating the mapping yields its keys)
list(cv.vocabulary_)[:50]

#Visualize top N uni-grams, bi-grams & tri-grams

We can use the CountVectoriser to visualise the top 20 unigrams, bi-grams and tri-grams.

In [0]:
#Most frequently occuring words
def get_top_n_words(corpus, n=None, ngram_range=(1, 1), max_features=None):
    """Return the most frequent terms of ``corpus`` with their total counts.

    Args:
        corpus: iterable of document strings.
        n: number of (term, count) pairs to return; ``None`` returns all.
        ngram_range: ``(min_n, max_n)`` forwarded to CountVectorizer. The
            default ``(1, 1)`` counts single words; ``(2, 2)`` counts bi-grams.
        max_features: optional vocabulary size cap forwarded to
            CountVectorizer; ``None`` keeps every term.

    Returns:
        A list of ``(term, count)`` tuples sorted by descending count.
    """
    vec = CountVectorizer(ngram_range=ngram_range, max_features=max_features)
    # fit_transform learns the vocabulary and counts terms in a single pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums = total occurrences of each term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]
# Top-20 single words as a two-column frame for plotting
top_words = get_top_n_words(corpus, n=20)
top_df = pandas.DataFrame(top_words, columns=["Word", "Freq"])
# Bar chart of the most frequent words, with tilted labels for readability
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(data=top_df, x="Word", y="Freq")
g.set_xticklabels(g.get_xticklabels(), rotation=30)

In [0]:
#Most frequently occuring Bi-grams
def get_top_n2_words(corpus, n=None):
    """Return the ``n`` most frequent bi-grams of ``corpus`` with their counts.

    The vocabulary is capped at 2000 bi-grams (CountVectorizer max_features).
    The result is a list of ``(bi-gram, count)`` tuples in descending count
    order; ``n=None`` returns the whole list.
    """
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=2000)
    bigram_vectorizer.fit(corpus)
    counts = bigram_vectorizer.transform(corpus)
    # Column sums give each bi-gram's total over all documents.
    totals = counts.sum(axis=0)
    ranked = []
    for term, column in bigram_vectorizer.vocabulary_.items():
        ranked.append((term, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Top-20 bi-grams as a two-column frame
top2_words = get_top_n2_words(corpus, n=20)
top2_df = pandas.DataFrame(top2_words, columns=["Bi-gram", "Freq"])
print(top2_df)
# Bar chart of the most frequent bi-grams, with tilted labels for readability
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
h = sns.barplot(data=top2_df, x="Bi-gram", y="Freq")
h.set_xticklabels(h.get_xticklabels(), rotation=45)

In [0]:
#Most frequently occuring Tri-grams
def get_top_n3_words(corpus, n=None):
    """Return the ``n`` most frequent tri-grams of ``corpus`` with their counts.

    The vocabulary is capped at 2000 tri-grams (CountVectorizer max_features).
    The result is a list of ``(tri-gram, count)`` tuples in descending count
    order; ``n=None`` returns the whole list.
    """
    trigram_vectorizer = CountVectorizer(ngram_range=(3, 3), max_features=2000)
    trigram_vectorizer.fit(corpus)
    counts = trigram_vectorizer.transform(corpus)
    # Column sums give each tri-gram's total over all documents.
    totals = counts.sum(axis=0)
    ranked = []
    for term, column in trigram_vectorizer.vocabulary_.items():
        ranked.append((term, totals[0, column]))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Top-20 tri-grams as a two-column frame
top3_words = get_top_n3_words(corpus, n=20)
top3_df = pandas.DataFrame(top3_words, columns=["Tri-gram", "Freq"])
print(top3_df)
# Bar chart of the most frequent tri-grams, with tilted labels for readability
import seaborn as sns
sns.set(rc={'figure.figsize': (13, 8)})
j = sns.barplot(data=top3_df, x="Tri-gram", y="Freq")
j.set_xticklabels(j.get_xticklabels(), rotation=45)


#Converting to a matrix of integers

The next step of refining the word counts is using the TF-IDF vectoriser. The deficiency of a mere word count obtained from the CountVectorizer is that large counts of certain common words may dilute the impact of more context-specific words in the corpus. This is overcome by the TF-IDF vectoriser, which penalizes words that appear in many documents of the corpus. TF-IDF scores are word frequency scores that highlight words that are more important to the context rather than those that appear frequently across documents.

TF-IDF consists of 2 components:

TF — term frequency

IDF — Inverse document frequency

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

# Learn the IDF weights from the count matrix X.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(X)
# get feature names
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); try the new name first and fall back on old versions.
try:
    feature_names = cv.get_feature_names_out()
except AttributeError:
    feature_names = cv.get_feature_names()
 


In [0]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    """Return ``(column, value)`` pairs of a COO matrix, highest value first.

    Pairs are built from the matrix's ``col`` and ``data`` attributes and
    ordered by value, then by column index, both descending.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first ``topn`` items to ``{feature name: rounded score}``.

    Args:
        feature_names: sequence indexed by the column index found in
            ``sorted_items``.
        sorted_items: sequence of ``(column index, score)`` pairs, already in
            the desired order.
        topn: how many leading pairs to keep.

    Returns:
        A dict, in the same order as the input pairs, from feature name to the
        score rounded to 3 decimals.
    """
    return {
        feature_names[column]: round(weight, 3)
        for column, weight in sorted_items[:topn]
    }

import os

# Print the keywords of every document and write one "<document>,<keyword>"
# line per multi-word keyword.
keywords_path = "/content/drive/My Drive/final-project/keywords/keyword-context.txt"
# open() does not create missing folders, so make sure "keywords/" exists.
os.makedirs(os.path.dirname(keywords_path), exist_ok=True)

# The with-block closes the file even if a document fails part-way through.
with open(keywords_path, "w") as context_keywords_file:
    # fetch document for which keywords needs to be extracted
    for doc in corpus:
        # generate tf-idf for the given document
        tf_idf_vector = tfidf_transformer.transform(cv.transform([doc]))
        # sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())
        # extract only the top n; n here is 5
        keywords = extract_topn_from_vector(feature_names, sorted_items, 5)

        print("\nContext:")
        print(doc)
        print("\nKeywords:")
        for k in keywords:
            # only use keywords made of at least 2 words
            if len(k.split()) >= 2:
                print(k, keywords[k])
                context_keywords_file.write('{},{}\n'.format(doc, k))

In [0]:
from google.colab import files

# Hand the generated keyword file to the browser as a download.
keyword_file_path = '/content/drive/My Drive/final-project/keywords/keyword-context.txt'
files.download(keyword_file_path)