# `Reading .csv file from web (gitHub) link`

In [0]:

import pandas as pd

#Read csv file directly from github
data = pd.read_csv("https://github.com/krus7ev/191024_data_processing_workshop/raw/master/191021_sentiment_test_csv_teXprocessed.csv")

print("\nDATA COLUMNS:")
print(data.columns)

print("\nDATA HEADER:")
print(data.head())

print("\nDATA DESCRIPTION:")
print(data.describe())

print("\nDATA SHAPE: ", data.shape)

# `Uploading (.xlsx) files`

In [0]:
from google.colab import files
uploaded = files.upload()

# `Reading .xlsx files filtering and exporting back to .csv`


In [0]:
import openpyxl as oxl
import pandas   as pd
import numpy    as np
import io

wb_data = oxl.load_workbook(io.BytesIO(uploaded['191022_propaganda_task1_train_truncated_11986.xlsx']))
print("\n", wb_data.sheetnames)

dataSheet = wb_data['191022_task1_train_truncated_11986']

cell_names = []
for cell in dataSheet[1] :
    cell_names.append(cell.value) 
print("\n", cell_names)

dataFrame = pd.DataFrame(dataSheet.values)
print(dataFrame.head)

# Convert to numpy
dataNumpy = dataFrame.get_values()

# Extract propaganda and non-prpopaganda annotations for separate analysis
dataNumpyPropaganda    = dataNumpy[np.where(dataNumpy[:,2] == 'propaganda')]
dataNumpyNonPropaganda = dataNumpy[np.where(dataNumpy[:,2] == 'non-propaganda')]

print("\nFirst propaganda sample     :\n", dataNumpyPropaganda[0])
print("\nFirst non-propaganda sample :\n", dataNumpyNonPropaganda[0])

print("\n"+"dataNumpyPropaganda    SHAPE : ", dataNumpyPropaganda.shape)
print("\n"+"dataNumpyNonPropaganda SHAPE : ", dataNumpyNonPropaganda.shape)

# Translate pandas data frame
dataFramePropaganda    = pd.DataFrame(dataNumpyPropaganda,    columns = cell_names)
dataFrameNonPropaganda = pd.DataFrame(dataNumpyNonPropaganda, columns = cell_names)

# `Text data cleaning and pre-processing`

In [0]:
def clean_text(text):
    
    # Remove special chars and punctuation
    text = " ".join(tokenizer(text))
    
    # lowcase
    text = text.lower()
    text = text.split(" ")
    
    # Remove stopwords
    text = [word for word in text if not word in stop_words]
    
    # Lematize
    #text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    
    # Lematize - "v"? - verb forms...?
    #text = [lemmatizer.lemmatize(token, "v") for token in text]
    
    text = " ".join(text)
    
    return text

In [0]:
import re 

def cleanTexts(texts_list, filters='!"#$%&()*+,-./:;0123456789<=>?@[\\]^_`{|}~\t\n', split=" ") :
    texts_clear = []

    for text in texts_list :
        translate_dict = dict((c, split) for c in filters)
        translate_map  = str.maketrans(translate_dict)
        text_clear     = text.translate(translate_map)

        text_clear = re.sub(r'[\s]+', ' ', text_clear)

        texts_clear   += [text_clear]

    return texts_clear

In [0]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem                       import WordNetLemmatizer
from nltk.corpus                     import stopwords
from sklearn.feature_extraction.text import CountVectorizer

print("\n"+"dataFramePropaganda    SHAPE : ", dataFramePropaganda.shape)
print("\n"+"dataFrameNonPropaganda SHAPE : ", dataFrameNonPropaganda.shape)


#lemmatizer = WordNetLemmatizer()

stop_words = stopwords.words("english")

cvec = CountVectorizer()
tokenizer = cvec.build_tokenizer()

dataFramePropaganda['article_prep']    = dataFramePropaganda['article'].apply(clean_text)
dataFrameNonPropaganda['article_prep'] = dataFrameNonPropaganda['article'].apply(clean_text)

print(dataFrameNonPropaganda['article_prep'].loc[0])

# `Data exploration`

In [0]:
from wordcloud   import WordCloud

propaganda_texts     = dataFramePropaganda['article_prep'].get_values()
non_propaganda_texts = dataFrameNonPropaganda['article_prep'].get_values()

print(propaganda_texts.shape)
print(non_propaganda_texts.shape)

propaganda_texts     = cleanTexts(propaganda_texts.tolist())
non_propaganda_texts = cleanTexts(non_propaganda_texts.tolist())


# Join the different processed titles together.
flat_propaganda_texts     = ' '.join(propaganda_texts)
flat_non_propaganda_texts = ' '.join(non_propaganda_texts)

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=500, contour_width=10, contour_color='steelblue', width=800, height=400)



In [0]:
# Generate a word cloud
wordcloud.generate(flat_propaganda_texts)
# Visualize the word cloud
wordcloud.to_image()

In [0]:
# Generate a word cloud
wordcloud.generate(flat_non_propaganda_texts)
# Visualize the word cloud
wordcloud.to_image()

# `Applying unsupervised analysis`

In [0]:
def plot_10_most_common_words(count_data, count_vectorizer):
    import matplotlib.pyplot as plt
    words = count_vectorizer.get_feature_names()
    total_counts = np.zeros(len(words))
    for t in count_data:
        total_counts+=t.toarray()[0]
    
    count_dict = (zip(words, total_counts))
    count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
    words = [w[0] for w in count_dict]
    counts = [w[1] for w in count_dict]
    x_pos = np.arange(len(words)) 
    
    plt.figure(2, figsize=(15, 15/1.6180))
    plt.subplot(title='10 most common words')
    sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
    sns.barplot(x_pos, counts, palette='husl')
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

`Top 10 Propaganda terms`

In [0]:
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('dark_background')
import seaborn as sns
#%matplotlib inline

#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialise a count vectorizer with the English stop words
count_vectorizer_PTs = CountVectorizer(max_features  = 23000,
                                       stop_words    ='english',
                                       token_pattern = r"(?u)\b\w+\b",
                                       ngram_range   = (1,3),
                                       min_df        = 0.005,
                                       binary        = False)

# Fit and transform the processed titles
cv_propaganda_texts = count_vectorizer_PTs.fit_transform(propaganda_texts)


# Initialise a term-frequency/inverse-doc-freq vectorizer with the English stop words
tfidf_vectorizer_PTs = TfidfVectorizer(max_features  = 23000,
                                       stop_words    ='english',
                                       token_pattern = r"(?u)\b\w+\b",
                                       ngram_range   = (1,3),
                                       min_df        = 0.005,
                                       use_idf       = False)

tfidf_vectorizer_PTs.fit(propaganda_texts)
cv_tfidf_propaganda_texts = tfidf_vectorizer_PTs.transform(propaganda_texts)


# Visualise the 10 most common words
plot_10_most_common_words(cv_tfidf_propaganda_texts, tfidf_vectorizer_PTs)

`Top 10 Non-propaganda terms`

In [0]:
# Initialise a count vectorizer with the English stop-words
count_vectorizer_NPTs = CountVectorizer(max_features  = 23000,
                                        stop_words    ='english',
                                        token_pattern = r"(?u)\b\w+\b",
                                        ngram_range   = (1,3),
                                        min_df        = 0.005,
                                        binary        = False)

# Fit-transform the processed texts
cv_non_propaganda_texts = count_vectorizer_NPTs.fit_transform(non_propaganda_texts)


# Initialise a tfidf vectorizer as well
tfidf_vectorizer_NPTs = TfidfVectorizer(max_features  = 23000,
                                        stop_words    ='english',
                                        token_pattern = r"(?u)\b\w+\b",
                                        ngram_range   = (1,3),
                                        min_df        = 0.005,
                                        use_idf       = True)

# Fit and transform the processed texts
tfidf_vectorizer_NPTs.fit(non_propaganda_texts)
cv_tfidf_non_propaganda_texts = tfidf_vectorizer_NPTs.transform(non_propaganda_texts)


# Visualise the 10 most common words
plot_10_most_common_words(cv_tfidf_non_propaganda_texts, tfidf_vectorizer_NPTs)

# `Basic count-based clustering: K-means`

In [0]:
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(cv_tfidf_propaganda_texts)


print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer_PTs.get_feature_names()
for i in range(true_k):
    print("\nCluster %d:" % (i+1)),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

print("\n")
print("Prediction")
Y = tfidf_vectorizer_PTs.transform(["High court judge ruled out Kavanaugh pleading guilty"])
prediction = model.predict(Y)
print(prediction)


# `Latent Dirichlet Аllocation for Topic modelling`

In [0]:
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

In [0]:
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
        
# Tweak the two parameters below
number_topics = 5
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)

lda.fit(cv_propaganda_texts)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer_PTs, number_words)

# `Visualise LDA`

In [0]:
!pip install pyLDAvis

from   pyLDAvis import sklearn as sklearn_lda
import pyLDAvis
import pickle 
from   datetime import datetime

timestamp_lda = datetime.now().strftime("%y%m%d_%H%Mh")
LDAvis_data_filepath = (timestamp_lda +'_LDA_vis_prep_'+ str(number_topics) + '_topics.pkl' )
LDAvis_prep_filepath = '191023_1150h_LDA_vis_prep_5_topics.pkl'
PREP_VIS = False


# This is a bit time consuming 
# - make the if statement True if you want to execute visualization prep yourself
if PREP_VIS is True :
    LDAvis_prepared = sklearn_lda.prepare(lda, cv_propaganda_texts, count_vectorizer_PTs)
    
    with open(LDAvis_data_filepath, 'wb') as outf:
        pickle.dump(LDAvis_prepared, outf)
        outf.close()
    
# We can use the pre-prepared pyLDAvis file here to load the data from disk
else :
    with open(LDAvis_prep_filepath, 'rb') as f :
        LDAvis_prepared = pickle.load(f)


pyLDAvis.save_html(LDAvis_prepared, timestamp_lda +'_LDA_vis_'+ str(number_topics) + '_topics.html')


# `Vectorize full dataset in compact scipy representation`

In [0]:
from scipy.sparse            import csr_matrix
from sklearn.model_selection import train_test_split

# Use same dataframe-numpy-list pattern again for all texts and labels (repeat same steps with some new)
dataFrameAllTexts = pd.DataFrame(dataNumpy[1:], columns = cell_names)
dataFrameAllTexts['article_prep'] = dataFrameAllTexts['article'].apply(clean_text)
dataFrameAllTexts['target']       = dataFrameAllTexts['label'].map({'propaganda': 1, 'non-propaganda': 0})

print(dataNumpy[0])
print(dataFrameAllTexts['label'].loc[23])
print(dataFrameAllTexts['target'].loc[23])

all_texts = dataFrameAllTexts['article_prep'].get_values()
all_texts = cleanTexts(all_texts.tolist())

# Shuffle and split dataset into train and test sets, stratifyig according to its class distribution
df_train_data, df_test_data = train_test_split(dataFrameAllTexts, test_size = 0.25, random_state = 42, stratify=dataFrameAllTexts['target'])

# Extract train texts and labels
train_texts  = df_train_data['article_prep'].get_values()
train_texts  = cleanTexts(train_texts.tolist())
train_labels = df_train_data['target'].get_values()


# Initialise a count vectorizer with the English stop-words
count_vectorizer_all = CountVectorizer(max_features  = 23000,
                                       stop_words    ='english',
                                       token_pattern = r"(?u)\b\w+\b",
                                       ngram_range   = (1,3),
                                       min_df        = 0.005,
                                       binary        = False)

# Fit vectorizer on all of the processed texts
count_vectorizer_all.fit(all_texts)


# Vectorize train set
cv_train_texts = count_vectorizer_all.transform(train_texts)
# Convert to dense representation for csr matrix encoding
cv_train_texts_dense = cv_train_texts.todense()


# Print out train dataset info
all_feature_names = count_vectorizer_all.get_feature_names()
print('\nTOTAL unique ngram-tokens found in texts: %s.\n' % len(all_feature_names))
print('Shape of BOW data mat:', cv_train_texts_dense.shape)


# Conert to CSR standard
train_texts_csr = csr_matrix(cv_train_texts_dense)
train_labels_csr   = [str(i) for i in train_labels]


# Visualise the 10 most common words
plot_10_most_common_words(cv_train_texts, count_vectorizer_all)


##########################################################
test_texts  = df_test_data['article_prep'].get_values()
test_texts  = cleanTexts(test_texts.tolist())
test_labels = df_test_data['target'].get_values()

# Vectorize test set too
cv_test_texts = count_vectorizer_all.transform(test_texts)
# Convert to dense representation for csr matrix encoding
cv_test_texts_dense = cv_test_texts.todense()

test_texts_csr = csr_matrix(cv_test_texts_dense)
test_labels_csr   = [str(i) for i in test_labels]

# `Train supervised text classifiers with sklearn`

`Decision Tree`

In [0]:
from   sklearn.tree                 import DecisionTreeClassifier
from   sklearn.model_selection      import GridSearchCV
from   sklearn.metrics              import confusion_matrix
from   datetime                     import datetime

print("\n===============================================")
print("  Training classic best-split Decision Tree\n")

DecTree = DecisionTreeClassifier()

DecTree_params = {'criterion'         : ('gini', 'entropy'),
                  'min_samples_split' : (2, 4, 8, 16, 32)}
DecTree_search = GridSearchCV(DecTree, DecTree_params, n_jobs=1)

DecTree_search.fit(train_texts_csr, train_labels_csr)
print("DecTree search best score  : ", DecTree_search.best_score_)
print("DecTree search best params : ", DecTree_search.best_params_)

# Store model for future prediction
DecTree_best = DecTree_search.best_estimator_

outf = open(datetime.now().strftime("%y%m%d_%H%Mh") + "_DecTree.pkl", "wb")
pickle.dump(DecTree_best, outf)
outf.close()

DecTree_predictions = DecTree_best.predict(test_texts_csr)
print("First prediction output    : ", DecTree_predictions[0])

DecTree_precision = np.mean(DecTree_predictions == test_labels_csr)
print("Test accuracy score        : ", DecTree_precision)

cm_raw = confusion_matrix(test_labels_csr, DecTree_predictions)
print('\nConfusion matrix, without normalization...')
print(cm_raw)

#Normalize the confusion matrix by row (i.e by the number of samples in each class)
print('\nNormalized confusion matrix....')
cm_norm = cm_raw.astype('float') / cm_raw.sum(axis=1)[:, np.newaxis]
print(cm_norm)


`Naive Bayes Classifier`

In [0]:
from   sklearn.naive_bayes          import MultinomialNB
from   sklearn.model_selection      import GridSearchCV
from   sklearn.metrics              import confusion_matrix
from   datetime                     import datetime

print("\n===============================================")
print("  Training Multinomial Naive Bayes Classifier\n")

MNB = MultinomialNB()

MNB_params = {'alpha' : (1.0, 1e-1, 1e-2, 1e-3)}
MNB_search = GridSearchCV(MNB, MNB_params, n_jobs=1) #, cv=5) #for k-folds

MNB_search.fit(train_texts_csr, train_labels_csr)
print("DecTree search best score  : ", MNB_search.best_score_)
print("DecTree search best params : ", MNB_search.best_params_)

# Store model for future prediction
MNB_best = MNB_search.best_estimator_

outf = open(datetime.now().strftime("%y%m%d_%H%Mh") + "_MNB.pkl", "wb")
pickle.dump(MNB_best, outf)
outf.close()

MNB_predictions = MNB_best.predict(test_texts_csr)
print("First prediction output    : ", MNB_predictions[0])

DecTree_precision = np.mean(MNB_predictions == test_labels_csr)
print("Test accuracy score        : ", DecTree_precision)

cm_raw = confusion_matrix(test_labels_csr, MNB_predictions)
print('\nConfusion matrix, without normalization...')
print(cm_raw)

#Normalize the confusion matrix by row (i.e by the number of samples in each class)
print('\nNormalized confusion matrix....')
cm_norm = cm_raw.astype('float') / cm_raw.sum(axis=1)[:, np.newaxis]
print(cm_norm)

`Support Vector Machine`

In [0]:
from   sklearn.svm                  import SVC
from   sklearn.model_selection      import GridSearchCV
from   sklearn.metrics              import confusion_matrix
from   datetime                     import datetime

print("\n===============================================")
print("  Training Support Vector Machine (SVC)\n")

SVC_SVM = SVC()

SVC_params = {'C'           : (1.0, 0.1, 0.01, 0.001), #fix with range
              'kernel'      : ('rbf', 'linear', 'sigmoid'),
              'degree'      : (2, 3, 4)}

SVC_search = GridSearchCV(SVC_SVM, SVC_params, n_jobs=1) #, cv=Settings.kFolds)

SVC_search.fit(train_texts_csr, train_labels_csr)
print("SVC search best score: ",  SVC_search.best_score_)
print("SVC search best params: ", SVC_search.best_params_)

# Store model for future prediction
SVC_best = SVC_search.best_estimator_

outf = open(datetime.now().strftime("%y%m%d_%H%Mh") + "_MNB.pkl", "wb")
pickle.dump(SVC_best, outf)
outf.close()

SVC_predictions = SVC_best.predict(test_texts_csr)
print("First prediction output    : ", SVC_predictions[0])

SVC_precision = np.mean(SVC_predictions == test_labels_csr)
print("Test accuracy score        : ", SVC_precision)

cm_raw = confusion_matrix(test_labels_csr, SVC_predictions)
print('\nConfusion matrix, without normalization...')
print(cm_raw)

#Normalize the confusion matrix by row (i.e by the number of samples in each class)
print('\nNormalized confusion matrix....')
cm_norm = cm_raw.astype('float') / cm_raw.sum(axis=1)[:, np.newaxis]
print(cm_norm)


  Training Multinomial Naive Bayes Classifier





SVC search best score:  0.913116030704194
SVC search best params:  {'C': 0.01, 'degree': 2, 'kernel': 'linear'}
First prediction output    :  0
Test accuracy score        :  0.9182515849182515

Confusion matrix, without normalization...
[[1898   94]
 [ 151  854]]

Normalized confusion matrix....
[[0.95281124 0.04718876]
 [0.15024876 0.84975124]]


# `Extracting web content with Scrapy`

In [0]:
!pip install scrapy
!pip install langdetect
!mkdir crawler_data

In [0]:
import pandas as pd
import re
import scrapy

from scrapy.crawler import CrawlerProcess
from langdetect     import detect 
from google.colab   import drive

html_tags = re.compile('<.*?>')


class AlbertEinsteinQuotes(scrapy.Spider):
    name = "AlbertEinsteinQuotes"
    start_urls = [
        'https://en.wikiquote.org/wiki/Albert_Einstein',
    ]
    
    def stripHTML(self, row):
        return re.sub(html_tags, '', row)
    
    def downloadDF(self, df, dfName):
        df.to_csv("crawled_data/" + dfName + '.csv')

    
    def parse(self, response):
        columns = ["text"]
        values_en, values_de, values_fr = ([] for i in range(3))
        for quote in response.css('div.mw-parser-output > ul > li'):
            row = self.stripHTML(quote.extract())
            text = row.split("\n")[0]
            if detect(text) == 'en':
                values_en.append(text)
            elif detect(text) == 'de':
                values_de.append(text)
            elif detect(text) == 'fr':
                values_fr.append(text)
                
        enQuotesDF = pd.DataFrame(values_en, columns=columns)
        print("English Dataframe Contens ", enQuotesDF[:10], sep='\n')    
        self.downloadDF(enQuotesDF, "Albert_Einstein_English_Quotes")
        
        deQuotesDF = pd.DataFrame(values_de, columns=columns)
        print("German Dataframe Contens ", deQuotesDF[:10], sep='\n')    
        self.downloadDF(enQuotesDF, "Albert_Einstein_German_Quotes")

        
        frQuotesDF = pd.DataFrame(values_fr, columns=columns)
        print("French Dataframe Contens ", frQuotesDF[:10], sep='\n')    
        self.downloadDF(enQuotesDF, "Albert_Einstein_French_Quotes")

process = CrawlerProcess()
process.crawl(AlbertEinsteinQuotes)
process.start()


# `Download files to local system`

In [0]:
from google.colab import files

with open('example.txt', 'w') as f:
  
  f.write('some content')

files.download('example.txt')

# `Mount Google Drive`

In [0]:
from google.colab import drive
drive.mount('/content/drive')