# **This notebook goes over the process of loading in the data set, cleaning the data and creating the corpus, dictionary and text files needed to train an LDA model using gensim**

Setting up directory on Google Drive

In [None]:
#@title Set up Directory { run: "auto"}
# Mounts Google Drive and makes the project folder the working directory, so the
# relative file names used by later cells resolve inside that folder.
import os
from IPython.display import clear_output
from google.colab import drive

drive.mount('/content/gdrive')
working_directory = 'My Drive/LDA_NewsData' #@param {type:"string"}
# Drive is mounted at /content/gdrive; the form value is a path relative to that mount point.
wd="/content/gdrive/"+working_directory
os.chdir(wd)

# Echo the resolved directory so a wrong form value is noticed immediately.
print("current directory is : " + os.getcwd())

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
current directory is : /content/gdrive/My Drive/LDA_NewsData


In [None]:
!pip install gensim



In [None]:
# Standard library
import pickle
import random

# Third-party
import nltk
import pandas as pd
import spacy
from gensim import corpora
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English

# NLTK resources used further down: WordNet for wn.morphy() and the English stop-word list.
# The former spacy.load('en') call was removed: its result was discarded, the tokenizer is built
# from English() instead, and the 'en' shortcut no longer resolves on spaCy 3+.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Read the news-article CSV from the current working directory
df = pd.read_csv('CA_news.csv')

# Preview the first five rows to check the columns came through as expected
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,authors,title,publish_date,description,text,url
0,0,[],'More vital now:' Gay-straight alliances go vi...,2020-05-03 1:30,Lily Overacker and Laurell Pallot start each g...,Lily Overacker and Laurell Pallot start each g...,https://www.cbc.ca/news/canada/calgary/gay-str...
1,1,[],Scientists aim to 'see' invisible transmission...,2020-05-02 8:00,Some researchers aim to learn more about how t...,"This is an excerpt from Second Opinion, a week...",https://www.cbc.ca/news/technology/droplet-tra...
2,2,['The Canadian Press'],Coronavirus: What's happening in Canada and ar...,2020-05-02 11:28,Canada's chief public health officer struck an...,The latest: The lives behind the numbers: Wha...,https://www.cbc.ca/news/canada/coronavirus-cov...
3,3,[],"B.C. announces 26 new coronavirus cases, new c...",2020-05-02 18:45,B.C. provincial health officer Dr. Bonnie Henr...,B.C. provincial health officer Dr. Bonnie Henr...,https://www.cbc.ca/news/canada/british-columbi...
4,4,[],"B.C. announces 26 new coronavirus cases, new c...",2020-05-02 18:45,B.C. provincial health officer Dr. Bonnie Henr...,B.C. provincial health officer Dr. Bonnie Henr...,https://www.cbc.ca/news/canada/british-columbi...


In [None]:
# Drop every row that has a missing value in any column
df = df.dropna()

# Parse publish_date into datetimes; bracket assignment is used because attribute-style
# assignment (df.publish_date = ...) is easy to get wrong and can silently create an attribute
df['publish_date'] = pd.to_datetime(df['publish_date'])

# Derive year and month columns from the parsed date
df['year'] = df['publish_date'].dt.year
df['month'] = df['publish_date'].dt.month

# Remove articles from 2012 and 2013, then order by year.
# sort_values returns a new frame here instead of sorting a filtered slice in place,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
df = df[(df['year'] != 2013) & (df['year'] != 2012)].sort_values(by=['year'])

In [None]:
#Function to clean out text and tokenize
# `parser` is a spaCy English language object; tokenize() calls it on raw text to obtain tokens
parser = English()
def tokenize(text):
    """Split ``text`` into normalised string tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped, URL-like tokens become the placeholder
    ``'URL'``, tokens starting with ``'@'`` become ``'SCREEN_NAME'``, and every
    other token is returned in its lower-cased form.

    Returns:
        list of str, in the order the parser produced the tokens.
    """
    def _normalise(tok):
        # Placeholders keep the signal "there was a link / handle here" without the noise
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [None]:
#Function to lemmatize tokens
def lemmatize(word):
    """Return the WordNet base form of ``word``, or ``word`` itself when ``wn.morphy`` returns None."""
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [None]:
#Function putting together tokenization and lemmatization
def prepare_text(text, min_length=5, stop_words=None):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    The text is tokenized with ``tokenize``; tokens shorter than ``min_length``
    characters or contained in ``stop_words`` are discarded; the survivors are
    passed through ``lemmatize``.

    Args:
        text: raw document string.
        min_length: minimum number of characters a token must have to be kept.
            The default of 5 matches the previous hard-coded ``len(token) > 4``.
        stop_words: container of tokens to exclude. When None, the module-level
            ``en_stop`` set is used, as before.

    Returns:
        list of str: the filtered, lemmatized tokens in document order.
    """
    if stop_words is None:
        # Looked up at call time, so en_stop only needs to exist before the first call
        stop_words = en_stop
    return [
        lemmatize(token)
        for token in tokenize(text)
        if len(token) >= min_length and token not in stop_words
    ]

In [None]:
#Applying prepare_text() to data
# English stop words from NLTK, held in a set for fast membership tests inside prepare_text()
en_stop = set(nltk.corpus.stopwords.words('english'))

# One token list per article description, in DataFrame row order
text_data = [prepare_text(description) for description in df['description']]

In [None]:
#create dictionary -> convert to bag-of-words corpus -> save to call upon later

# Build the gensim Dictionary from the tokenized documents, then express each document as bag-of-words
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Context managers close each file handle as soon as the dump finishes, so the pickles are
# flushed to disk instead of depending on garbage collection of an anonymous open() handle
with open('text_data.pkl', 'wb') as text_data_file:
    pickle.dump(text_data, text_data_file)
with open('corpus_test.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary_test.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
