# Read the data ###

In [None]:
# Import the core libraries (pandas, pickle, NumPy, matplotlib)
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# graph_test_4 (french)
# graph_test (english)

# Node table (dataset with titles), ordered by modularity class, without the 'timeset' column
data = (
    pd.read_csv('data/graph_test_4.csv')
    .sort_values(by='modularity_class')
    .drop(columns=['timeset'])
)

# Scraped extracts (summary of each page) from the web
data_e = pd.read_csv('data/extract_fr.csv')
#data_t = pd.read_csv('data/texts.csv') #dataset with scraped extracts (texts of page)from web

# Attach the extracts to the nodes (data.Id matched against data_e.id),
# then discard incomplete rows and the 'Label' / 'Unnamed: 0' columns
extracts_by_id = data_e.set_index('id')
df = data.join(extracts_by_id, on='Id')
#df = df.join(data_t.set_index('id'), on='Id')
df = df.dropna().drop(columns=['Label', 'Unnamed: 0'])

In [None]:
# Preview the first rows of the joined table
df.head()

In [None]:
def chunks(l, n):
    '''Lazily split the sequence ``l`` into consecutive slices of at most ``n`` items.

    A chunk size below 1 is treated as 1. The result is a generator; the last
    slice may be shorter than the others.
    '''
    step = n if n > 1 else 1
    starts = range(0, len(l), step)
    return (l[start:start + step] for start in starts)

## Complete the dataset - web scraping ###

In [None]:
# import requests
# import json

# df = pd.DataFrame(columns=['id', 'title', 'extract'])

# ids = sorted(data.Id.tolist())
# id_chunks = chunks(ids, 49)
# for chunk in id_chunks:
#     string_ids = ""
#     for idd in chunk:
#         string_ids = string_ids + str(idd) + '|'
#     base_url = 'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&pageids='
#     url = base_url + string_ids[:-1]
#     res = requests.get(url)
#     response = json.loads(res.text)
#     print(response)
#     for key in response['query']['pages']:
#         try:
#             pageid = response['query']['pages'][key]['pageid']
#             title = response['query']['pages'][key]['title']
#             extract = response['query']['pages'][key]['extract']
#             df = df.append({'id': pageid, 'title': title, 'extract': extract}, ignore_index=True)
#         except KeyError as e:
#             pass
# #             print(e, key)

In [None]:
# import requests
# import json

# df = pd.DataFrame(columns=['id', 'title', 'extract'])

# ids = sorted(data.Id.tolist())
# for idd in ids:
#     base_url = 'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&pageids='
#     url = base_url + str(idd)
#     res = requests.get(url)
#     response = json.loads(res.text)
#     for key in response['query']['pages']:
#         try:
#             pageid = response['query']['pages'][key]['pageid']
#             title = response['query']['pages'][key]['title']
#             extract = response['query']['pages'][key]['extract']
#             df = df.append({'id': pageid, 'title': title, 'extract': extract}, ignore_index=True)
#         except KeyError as e:
#             print(e, key)

In [None]:
# import requests
# import json

# df = pd.DataFrame(columns=['id', 'text'])

# ids = sorted(data.Id.tolist())
# for idd in ids:
#     base_url = 'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&redirects=true&pageids='
#     #base_url = 'https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&pageids='
#     url = base_url + str(idd)
#     res = requests.get(url)
#     response = json.loads(res.text)
#     for key in response['query']['pages']:
#         try:
#             pageid = response['query']['pages'][key]['pageid']
#             texts = response['query']['pages'][key]['extract']
#             df = df.append({'id': pageid, 'text': texts}, ignore_index=True)
#         except KeyError as e:
#             print(e, key)

In [None]:
# One row per modularity class, with all of its titles joined by spaces
titles_by_class = df.groupby("modularity_class")['title']
grouped_title = titles_by_class.apply(' '.join).reset_index()

In [None]:
# One row per modularity class, with all of its extracts joined by spaces
extracts_by_class = df.groupby("modularity_class")['extract']
grouped_extract = extracts_by_class.apply(' '.join).reset_index()

In [None]:
# Preview the concatenated extracts per modularity class
grouped_extract.head()

# Clean the data

In [None]:
import re
import string

# Patterns used by the first cleaning round. They are compiled once, and written as raw
# strings so that '\[' / '\w' / '\d' are not flagged as invalid escape sequences.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')
_NON_LATIN_RE = re.compile(r'[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]')


# Apply a first round of text cleaning techniques
def clean_text_round1(text):
    '''Return a cleaned copy of ``text``.

    Steps, in order: strip surrounding whitespace, lowercase, turn newlines
    into spaces, remove text in square brackets, remove punctuation, remove
    words containing digits, and remove characters outside the Latin ranges.

    Removed pieces are deleted, not replaced by a space, so runs of spaces can
    remain and an elision such as "l'homme" becomes "lhomme".
    '''
    text = text.strip().lower().replace('\n', ' ')
    text = _BRACKETED_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    text = _NON_LATIN_RE.sub('', text)  # remove non-latin chars
    return text


# Alias kept for the cells that call round1
round1 = clean_text_round1

In [None]:
# Run the first cleaning round on every concatenated extract
cleaned_extracts = grouped_extract.extract.apply(round1)
data_clean = pd.DataFrame(cleaned_extracts)
data_clean.head()

In [None]:
# Let's pickle the cleaned extracts so later cells can reload them from "data_clean.pkl"
data_clean.to_pickle("data_clean.pkl")

In [None]:
# Number of whitespace-separated words in each cleaned document (one value per row)
word_counts = data_clean.extract.str.split().str.len()

fig, ax = plt.subplots()
# Take x from the same index as the heights so every bar stays aligned with its row
ax.bar(word_counts.index, word_counts)
ax.set_xlabel('Document Index')
ax.set_ylabel('Number of words');  # trailing ';' keeps the return value out of the cell output

# Document-Term Matrix

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# NOTE(review): stop_words='english' is used here although the data file read at the top of the
# notebook is labelled French — confirm that this first pass is intentional
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.extract)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old name only on releases that predate get_feature_names_out()
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# One row per document (same index as data_clean), one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)
data_dtm

In [None]:
# Let's pickle the document-term matrix to "dtm.pkl" for later use
data_dtm.to_pickle("dtm.pkl")

In [None]:
# Find the top 30 words in each cluster
# Every row of data_dtm holds the word counts of one cluster, so walk the rows directly
# instead of transposing the whole matrix again on each pass of the loop.
top_dict = {}
for c, counts in data_dtm.iterrows():
    top = counts.sort_values(ascending=False).head(30)
    # Stored as [(word, count), ...], most frequent first
    top_dict[c] = list(zip(top.index, top.values))

top_dict

In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Flatten the per-cluster top-30 lists into a single list of words (counts are dropped)
words = [word for cluster in data_dtm.index for (word, count) in top_dict[cluster]]

words

In [None]:
# If a word is a top word in more than 10 clusters, treat it as an additional stop word
# NOTE(review): the threshold 10 is hard-coded; it means "more than half of the clusters"
# only when there are about 20 clusters — confirm against the actual number of clusters
add_stop_words = [word for word, count in Counter(words).most_common() if count > 10]
# 'mort' is added by hand, whatever its frequency
add_stop_words.append('mort')
add_stop_words

<span style="color:red">*French version*</span>

In [None]:
# Load the 'fr' stop-word list from the stop_words package
from stop_words import get_stop_words
stop_words = get_stop_words('fr')

In [None]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Add new stop words
stop_words = get_stop_words('fr') +add_stop_words #text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix
cv = CountVectorizer(stop_words=stop_words)
data_cv = cv.fit_transform(data_clean.extract)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old name only on releases that predate get_feature_names_out()
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names, index=data_clean.index)

# Pickle it for later use
import pickle
# A with-block closes the file even if pickling fails
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")

# Topic Modeling 

### First Attempt : all the words

In [None]:
# Import the necessary modules for LDA with gensim
from gensim import matutils, models
import scipy.sparse

In [None]:
# One of the required inputs is a term-document matrix: flip the document-term matrix
# so that terms become rows and documents become columns
tdm = data_stop.T
tdm.head()

In [None]:
# We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
# (Sparse2Corpus is called with its defaults; presumably each column of the matrix is read as one document — confirm)
sparse_counts = scipy.sparse.csr_matrix(tdm)
corpus = matutils.Sparse2Corpus(sparse_counts)

In [None]:
# Gensim also requires dictionary of the all terms and their respective location in the term-document matrix
# NOTE: pickle can run arbitrary code on load; only load files written by this notebook
with open("cv_stop.pkl", "rb") as cv_file:  # the with-block closes the file handle
    cv = pickle.load(cv_file)
# cv.vocabulary_ maps term -> column index; gensim wants the reverse mapping
id2word = {index: term for term, index in cv.vocabulary_.items()}

In [None]:
# Now that we have the corpus (term-document matrix) and id2word (dictionary of location: term),
# we need to specify two other parameters as well - the number of topics and the number of passes
# Here: 5 topics and 40 passes.
# NOTE(review): no random_state is passed, so the topics presumably change between runs — confirm
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=40)
lda.print_topics()

### Second Attempt : nouns only

In [None]:
# Download the NLTK resources needed by the cells below
import nltk
nltk.download('punkt')  # presumably the tokenizer data behind word_tokenize — confirm
nltk.download('averaged_perceptron_tagger')  # presumably the tagger model behind pos_tag — confirm

In [None]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Return the nouns of ``text`` as one space-separated string.

    The text is split with NLTK's French tokenizer, every token is tagged with
    ``pos_tag``, and the tokens whose tag starts with 'NN' are kept in order.
    '''
    # tokens = word_tokenize(text)
    tokens = word_tokenize(text, language='french')
    # NOTE(review): pos_tag is called without a language argument, so it presumably applies
    # NLTK's default tagger to French tokens — confirm that the tags are reliable
    kept = []
    for token, tag in pos_tag(tokens):
        if tag[:2] == 'NN':
            kept.append(token)
    return ' '.join(kept)

In [None]:
# Read in the cleaned data (the frame pickled before the CountVectorizer step)
data_clean = pd.read_pickle('data_clean.pkl')
data_clean

In [None]:
# Apply the nouns function to the cleaned extracts to keep only the nouns
data_nouns = pd.DataFrame(data_clean.extract.apply(nouns))
data_nouns

In [None]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
stop_words = get_stop_words('fr') +add_stop_words #text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(data_nouns.extract)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old name only on releases that predate get_feature_names_out()
feature_names = (
    cvn.get_feature_names_out()
    if hasattr(cvn, 'get_feature_names_out')
    else cvn.get_feature_names()
)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=feature_names, index=data_nouns.index)
data_dtmn

In [None]:
# Create the gensim corpus from the transposed nouns-only matrix
tdmn_sparse = scipy.sparse.csr_matrix(data_dtmn.transpose())
corpusn = matutils.Sparse2Corpus(tdmn_sparse)

# Create the vocabulary dictionary (column index -> term)
id2wordn = {index: term for term, index in cvn.vocabulary_.items()}

In [None]:
# Let's start with 2 topics (10 passes) on the nouns-only corpus
ldan = models.LdaModel(corpus=corpusn, num_topics=2, id2word=id2wordn, passes=10)
ldan.print_topics()

In [None]:
# Let's try more topics: 4 topics (10 passes) on the nouns-only corpus
ldan = models.LdaModel(corpus=corpusn, num_topics=4, id2word=id2wordn, passes=10)
ldan.print_topics()

### Third Attempt : nouns and adjectives only

In [None]:
# Let's create a function to pull out nouns and adjectives from a string of text
def nouns_adj(text):
    '''Return the nouns and adjectives of ``text`` as one space-separated string.

    The text is split with NLTK's French tokenizer, every token is tagged with
    ``pos_tag``, and the tokens whose tag starts with 'NN' or 'JJ' are kept in order.
    '''
    wanted_prefixes = ('NN', 'JJ')
    # tokens = word_tokenize(text)
    tokens = word_tokenize(text, language='french')
    # NOTE(review): pos_tag is called without a language argument, so it presumably applies
    # NLTK's default tagger to French tokens — confirm that the tags are reliable
    selected = [token for (token, tag) in pos_tag(tokens) if tag[:2] in wanted_prefixes]
    return ' '.join(selected)

In [None]:
# Apply the nouns_adj function to the cleaned extracts to keep only nouns and adjectives
data_nouns_adj = pd.DataFrame(data_clean.extract.apply(nouns_adj))
data_nouns_adj

In [None]:
# Create a new document-term matrix using only nouns and adjectives, also remove common words with max_df
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.extract)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when it exists
# and fall back to the old name only on releases that predate get_feature_names_out()
feature_names = (
    cvna.get_feature_names_out()
    if hasattr(cvna, 'get_feature_names_out')
    else cvna.get_feature_names()
)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=feature_names, index=data_nouns_adj.index)
data_dtmna

In [None]:
# Create the gensim corpus from the transposed nouns-and-adjectives matrix
tdmna_sparse = scipy.sparse.csr_matrix(data_dtmna.transpose())
corpusna = matutils.Sparse2Corpus(tdmna_sparse)

# Create the vocabulary dictionary (column index -> term)
id2wordna = {index: term for term, index in cvna.vocabulary_.items()}

In [None]:
# Let's start with 2 topics (10 passes) on the nouns-and-adjectives corpus
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

In [None]:
# Now let's try 4 topics (10 passes)
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

In [None]:
# Finally, 6 topics with 100 passes; this is the model left in ldana for the cells below
ldana = models.LdaModel(corpus=corpusna, num_topics=6, id2word=id2wordna, passes=100)
ldana.print_topics()

# Identify topics in each class

In [None]:
# Let's take a look at which topics each class document contains
corpus_transformed = ldana[corpusna]
#list(zip([a for [(a,b)] in corpus_transformed], data_dtm.index))
try:
    # Pair every document's topic distribution with the row label of the matrix
    # the corpus was built from
    # NOTE(review): these labels are the row index of the matrix, presumably not the
    # modularity_class values themselves — confirm before reading them as class ids
    listt = list(zip(corpus_transformed, data_dtmna.index))
except IndexError as err:
    # Stay best-effort, but leave listt defined and say why it is empty, so the next
    # cell neither fails with a NameError nor silently prints stale results
    listt = []
    print('Topic assignment failed:', err)

In [None]:
# Print, for every entry of listt, its label and the ids of the topics in its distribution
for doc_topics, class_label in listt:
    list_of_topics = [entry[0] for entry in doc_topics]
    print('Class:' , class_label, 'Topic(s):', list_of_topics)

# Collect category labels

Done with the English dataset

In [None]:
# df = df.drop(columns=['text'])
# df.head()
# Dataset with categories: discard incomplete rows and the 'Unnamed: 0' column
df_cat = (
    pd.read_csv('data/categories.csv')
    .dropna()
    .drop(columns=['Unnamed: 0'])
)
df_cat.head()

## Complete the dataset - getting the categories###

In [None]:
# import requests
# from dateutil import parser

# pageids = df.Id.tolist()
# categories = []

# sess = requests.Session()

# URL = "https://en.wikipedia.org/w/api.php"

# for pageid in pageids:

#     PARAMS = {
#         "action": "query",
#         "format": "json",
#         "prop": "categories",
#         "redirects": "1",
#         "cllimit": "max",
#         "pageids": pageid
#     }

#     res = sess.get(url=URL, params=PARAMS)
#     data = res.json()
#     cats = []
#     for category in data['query']['pages'][str(pageid)]['categories']:
#         cat = category['title'].lower()
#         cat = cat.replace('category:', '')
#         cat = cat.replace('-', ' ')
#         reject = False
#         reject += 'articles' in cat.lower()
#         reject += 'wiki' in cat.lower()
#         reject += 'pages' in cat.lower()
#         reject += 'cs1' in cat.lower()   
#         reject += 'template' in cat.lower()
#         try:
#             parser.parse(cat, fuzzy=True)
#             reject += True
#         except ValueError:
#             reject += False
#         if reject == False:
#             cats.append(cat)
#     categories.append(cats)

# df.to_csv('categories.csv')

In [None]:
import ast


def _parse_category_cell(cell):
    '''Turn one cell of the categories column into a list of category strings.'''
    if not isinstance(cell, str):
        return list(cell)
    try:
        parsed = ast.literal_eval(cell)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    # Not a Python-literal list: fall back to comma-separated values
    return [part.strip() for part in cell.split(',') if part.strip()]


# Cells read back from a CSV are strings, so looping over one directly would walk it
# character by character; parse each cell into a real list of categories first.
# NOTE(review): assumes the column holds list literals such as "['a', 'b']" (or
# comma-separated names) — confirm against data/categories.csv
categories = [_parse_category_cell(cell) for cell in df_cat.categories.tolist()]

# Keep only nouns and adjectives of every category, then join each page's categories with ', '
categories1 = [', '.join(nouns_adj(cat) for cat in cats) for cats in categories]
df_cat['categories'] = categories1

In [None]:
# Display the category values read from df_cat (before the noun/adjective filtering)
categories

In [None]:
# import collections
# all_categories = [item for sublist in categories for item in sublist]
# counter=collections.Counter(all_categories)
# common_categories = [a[0] for a in counter.most_common(3000)]
# common_categories[:100]

In [None]:
# Attach the categories to df, pairing values with df's rows by position
# NOTE(review): `categories` is built from df_cat after dropna(), so its length may differ
# from len(df) and the rows are not matched by page id — confirm both frames line up.
# NOTE(review): this stores `categories`, not the filtered `categories1` — confirm intent.
df['categories'] = pd.Series(categories, index=df.index)