In [0]:
#Install libraries
# The two pip lines are left commented out; uncomment them if langdetect or
# spaCy are not already available in the runtime.
#pip install langdetect
#pip install spacy
# Download the small Spanish and English spaCy models that the import cell
# loads as es_core_news_sm / en_core_web_sm.
!python3 -m spacy download es_core_news_sm
!python3 -m spacy download en_core_web_sm

In [0]:
#Load libraries
from google.colab import drive #to use Google Drive
import time
import pickle
import pandas as pd
import numpy as np
import nltk #for NLP
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, sent_tokenize
import re #regular expressions
import spacy
import es_core_news_sm
import en_core_web_sm
from langdetect import detect
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

import collections as coll
import math
import string
import matplotlib.pyplot as plt
from matplotlib import style

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from lightgbm import LGBMClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fetch the NLTK resources requested by this notebook: stop-word lists,
# WordNet, the Punkt tokenizer models and the CMU pronouncing dictionary.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('cmudict')

#mount google drive
# Every data file read or written below lives under '/gdrive/My Drive/...'.
drive.mount('/gdrive')

In [0]:
####### FUNCTIONS FOR DATA PREPARATION

# Standardize separators
def standard_sep(text,sep):
    """Lower-case `text` and rewrite the usual author delimiters as `sep`.

    Line breaks, tabs, ' and ', ', ', 'contributor', ' (', ' & ', ' | ' and
    '- ' are all replaced by `sep`; runs of spaces and repeated separators are
    collapsed.

    Args:
        text: raw delimiter-separated string (e.g. an "authors" cell).
        sep: separator to standardise on.

    Returns:
        The normalised lower-case string, or None when `text` or `sep` is not
        a string (e.g. a missing cell).
    """
    if not isinstance(text, str) or not isinstance(sep, str):
        return None

    # An ordered tuple (not a set) keeps the result deterministic: the
    # replacements interact, so the compound "<sep>and " pattern has to be
    # consumed before the plain ' and ' pattern can split it.
    signs = (sep + 'and ', '\r\n', '\n', '\t', ' and ', ', ', 'contributor',
             ' (', ' & ', ' | ', '- ')
    doubled = sep + sep

    text = text.lower()
    for sign in signs:
        # Treat the non-breaking space as a plain space and collapse space runs.
        text = re.sub(' {2,}', ' ', text.replace('\u00a0', ' '))
        text = text.replace(sign, sep)
        # Collapse repeated separators; an empty separator cannot repeat (and
        # would make this loop endless), hence the guard.
        if sep:
            while doubled in text:
                text = text.replace(doubled, sep)
    return text

# Remove punctuation
# Characters that remove_punctuation() turns into a single space.
_PUNCTUATION_SIGNS = ('/', '"', '(', ')', '.', ',', '%', ';', '?', '¿', '!', '¡',
                      ':', '#', '$', '&', '>', '<', '-', '_', '°', '|', '¬', '\\', '*', '+',
                      '[', ']', '{', '}', '=', "'", '@', '‘', '’', '—', '«', '»', '“', '”')
_PUNCTUATION_TABLE = str.maketrans({sign: ' ' for sign in _PUNCTUATION_SIGNS})


def remove_punctuation(text):
    """Replace punctuation signs by spaces and collapse repeated spaces.

    Args:
        text: string to clean.

    Returns:
        The cleaned string (leading/trailing single spaces are kept), or None
        when `text` is not a string.
    """
    if not isinstance(text, str):
        return None
    spaced = text.translate(_PUNCTUATION_TABLE)
    # Collapse every run of spaces, however long, into one space.
    return re.sub(' {2,}', ' ', spaced)

# Cleaning text function
def clean_text(text, lang):
    """Lower-case `text`, strip URLs and punctuation, and drop stop words.

    Args:
        text: raw document.
        lang: NLTK stop-word language name (e.g. 'english', 'spanish').

    Returns:
        The cleaned text with single-space separated words, or None when the
        input cannot be processed (e.g. `text` is not a string).
    """
    try:
        lowered = text.lower()  # converts to lowercase
        # Remove only the URL token itself; the previous greedy pattern also
        # deleted everything that followed the URL on the same line.
        without_urls = re.sub(r" ?(?:f|ht)tps?://\S+", "", lowered)
        depunctuated = remove_punctuation(without_urls)
        # Load the stop-word list once per call instead of once per word.
        stop_words = set(stopwords.words(lang))
        return ' '.join(word for word in depunctuated.split() if word not in stop_words)
    except Exception:
        # Best-effort contract: any processing failure yields None, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return None

# Stemming function

def stem_text(text, lang):
    """Stem every whitespace-separated word of `text` with a Snowball stemmer.

    Args:
        text: document to stem.
        lang: language name passed to SnowballStemmer (e.g. 'english').

    Returns:
        The stemmed words joined by single spaces, or None when stemming fails
        (e.g. `text` is None).
    """
    try:
        stemmer = SnowballStemmer(lang)
        return ' '.join(stemmer.stem(word) for word in text.split())
    except Exception:
        # Keep the "None on failure" contract without trapping KeyboardInterrupt.
        return None

# Lemmatization function
# Loaded spaCy pipelines keyed by language name; filled lazily so each model
# is loaded once instead of once per document.
_SPACY_PIPELINES = {}


def lemma_text(text, lang):
    """Lemmatise `text` with the spaCy model for `lang`.

    Args:
        text: document to lemmatise.
        lang: 'spanish' or 'english'.

    Returns:
        The lower-cased lemmas joined by single spaces, or None when `lang`
        is not supported or processing fails (e.g. `text` is None).
    """
    try:
        nlp = _SPACY_PIPELINES.get(lang)
        if nlp is None:
            if lang == 'spanish':
                nlp = es_core_news_sm.load()
            elif lang == 'english':
                nlp = en_core_web_sm.load()
            else:
                return None
            _SPACY_PIPELINES[lang] = nlp
        return ' '.join(tok.lemma_.lower() for tok in nlp(text))
    except Exception:
        # Keep the "None on failure" contract without trapping KeyboardInterrupt.
        return None

# Language detection
def detect_lang(text, default_lang):
    """Detect the language of `text`, falling back to `default_lang`.

    Args:
        text: document whose language is wanted.
        default_lang: value used when detection fails.

    Returns:
        'spanish' for 'es', 'english' for 'en'; any other detected code (or
        the default) is returned unchanged.
    """
    try:
        # Detect on the argument itself; the previous code read an unrelated
        # name `doc`, so the result never depended on `text`.
        output = detect(text)
    except Exception:
        output = default_lang
    if output == 'es': output = 'spanish'
    if output == 'en': output = 'english'
    return output

# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------
####### FUNCTIONS FOR TEXT CLUSTERING

# Best LDA model
def best_LDA(search_params, documents):
  """Grid-search LatentDirichletAllocation over `search_params`.

  The documents are turned into a term-count matrix (terms present in more
  than 95% of documents or in fewer than 2 documents are dropped) and the
  grid search is fitted on that matrix.

  Returns:
      (best parameters, best cross-validated score, perplexity of the best
      estimator on the full term-count matrix).
  """
  term_counts = CountVectorizer(max_df=0.95, min_df=2).fit_transform(documents)
  grid = GridSearchCV(LatentDirichletAllocation(), param_grid=search_params)
  grid.fit(term_counts)
  perplexity = grid.best_estimator_.perplexity(term_counts)
  return grid.best_params_, grid.best_score_, perplexity

# Clean strings and returning an array of ngrams
def ngrams_analyzer(string, N = 3):
    """Return every character N-gram of `string`, in order.

    Commas, hyphens, periods and slashes are removed first. An empty list is
    returned when the cleaned string is shorter than N.
    """
    cleaned = re.sub(r'[,-./]', r'', string)
    if N <= 0:
        return []
    last_start = len(cleaned) - N
    return [cleaned[start:start + N] for start in range(last_start + 1)]

# Calculates TF IDF matrix and features
def tf_idf_transformer(documents):
  """Vectorise `documents` into a term-count matrix.

  Despite the name, the matrix holds raw term counts from CountVectorizer
  (terms in more than 95% of documents or in fewer than 2 are dropped), not
  TF-IDF weights.

  Returns:
      (sparse document-term matrix, list of feature names in column order).
  """
  vectorizer = CountVectorizer(max_df=0.95, min_df=2)
  #vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
  tf_idf = vectorizer.fit_transform(documents)
  # get_feature_names() was removed in scikit-learn 1.2; prefer the
  # replacement and fall back only on old installations.
  if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
  else:
    feature_names = vectorizer.get_feature_names()
  return tf_idf, feature_names

# Create LDA model
def LDA_model(documents, n_topics, n_words):
  """Fit an LDA topic model and summarise it.

  Args:
      documents: iterable of text documents.
      n_topics: number of LDA components.
      n_words: number of top-weighted words reported per topic.

  Returns:
      (mft, mfw): `mft` is a list with the index of the highest-weighted topic
      for each document; `mfw` is a DataFrame with columns 'most_freq_topic'
      and 'most_freq_words' (space-joined top words of each topic).
  """
  term_matrix, vocabulary = tf_idf_transformer(documents)
  lda = LatentDirichletAllocation(n_components = n_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)
  lda_model = lda.fit(term_matrix)
  doc_topic_weights = lda_model.transform(term_matrix)  # one row of topic weights per document
  topic_word_weights = lda_model.components_            # one row of word weights per topic

  # Dominant topic of each document (first index wins on ties).
  mft = [row.index(max(row)) for row in doc_topic_weights.tolist()]

  # Top `n_words` words of every topic, highest weight first.
  topic_rows = []
  for topic_idx, weights in enumerate(topic_word_weights):
    top_positions = weights.argsort()[:-n_words - 1:-1]
    top_words = " ".join([vocabulary[i] for i in top_positions])
    topic_rows.append((topic_idx, top_words))
  mfw = pd.DataFrame(topic_rows, columns =['most_freq_topic', 'most_freq_words'])
  return mft, mfw

# create kMeans model
def kMeans_ElbowMethod(data_as_list, kmin, kmax):
    """Plot the k-means inertia for k in [kmin, kmax) as an elbow curve.

    NOTE(review): the plotted quantity is `inertia_`, while the y-axis label
    reads "Square Root Error" - confirm the intended label.
    """
    cluster_counts = range(kmin, kmax)
    distorsions = [KMeans(n_clusters=k).fit(data_as_list).inertia_
                   for k in cluster_counts]

    plt.figure(figsize=(15, 5))
    plt.plot(cluster_counts, distorsions, 'bo-')
    plt.grid(True)
    plt.title('Elbow curve')
    plt.xlabel("Number of Clusters")
    plt.ylabel("Square Root Error")
    #plt.savefig("ElbowCurve.png")
    plt.show()

def PCA_transformer(numeric_data, c_level):
  """Standardise `numeric_data` and project it onto its leading principal components.

  Args:
      numeric_data: 2-D array-like of numeric features (rows are samples).
      c_level: cumulative explained-variance ratio that the retained
          components must exceed (e.g. 0.9). Previously this argument was
          ignored and 0.9 was always used.

  Returns:
      Array with one row per sample and one column per retained component.
  """
  np_data = np.array(numeric_data)
  # Standardise every feature to zero mean and unit variance before PCA.
  x = StandardScaler().fit_transform(np_data)
  # Cumulative explained variance of the full decomposition.
  csum = np.cumsum(PCA().fit(x).explained_variance_ratio_)
  # Smallest number of components whose cumulative ratio is strictly above
  # c_level; all components are kept when the level is never exceeded.
  nc = min(int(np.searchsorted(csum, c_level, side='right')) + 1, len(csum))
  return PCA(n_components=nc).fit_transform(x)

def kMeans_clusters(numeric_data, K):
    """Cluster `numeric_data` into K groups with k-means on its PCA projection.

    Args:
        numeric_data: 2-D array-like of numeric features (rows are samples).
        K: number of clusters.

    Returns:
        Array with the cluster label of every row.
    """
    # reduce dimensionality with PCA
    components = PCA_transformer(numeric_data, c_level = .9)
    # `n_jobs` is no longer accepted by KMeans (removed in scikit-learn 1.0),
    # so the estimator is built without it.
    kmeans = KMeans(n_clusters=K)
    kmeans.fit(components)
    return kmeans.labels_

def kMeans_topics(documents, n_topics):
  """Cluster documents with k-means on character n-gram TF-IDF vectors.

  Returns:
      List with the cluster index predicted for each document.
  """
  vectorizer = TfidfVectorizer(analyzer=ngrams_analyzer)
  doc_matrix = vectorizer.fit_transform(documents)
  fitted = KMeans(n_clusters=n_topics).fit(doc_matrix)
  # Re-vectorise the same documents and ask the fitted model for their clusters.
  reprojected = vectorizer.transform(documents)
  return fitted.predict(reprojected).tolist()
# -----------------------------------------------------------------------------------------
# -----------------------------------------------------------------------------------------
####### FUNCTIONS FOR STYLOMETRIC FEATURE EXTRACTION

# returns a feature vector of text
def FeatureExtraction(texts, winSize, step):
    """Compute a 20-value stylometric feature vector for every document.

    Side effect: (re)loads the CMU pronouncing dictionary into the module-level
    `cmuDictionary`, which syllable_count() reads.

    Args:
        texts: iterable of raw document strings.
        winSize: forwarded to the four readability formulas as their
            sentence-count argument.
            NOTE(review): the real sentence count of each document
            (`len(sent_tokens)`) is not used there - confirm this is intended.
        step: accepted but not used in this function.

    Returns:
        List of feature lists, one per document, ordered as: average word
        length, average sentence length in characters, average sentence length
        in words, average syllables per word, special-character ratio,
        punctuation ratio, functional-word ratio, type-token ratio, hapax
        ratio, Honore R, dis-hapax ratio, Sichel S, Yule K, Simpson index,
        Brunet W, Shannon entropy, Flesch reading ease, Flesch-Kincaid grade,
        Dale-Chall score, Gunning fog index.
    """
    # cmu dictionary for syllables
    global cmuDictionary
    cmuDictionary = cmudict.dict()

    vector = []
    for doc in texts:
        tokens = word_tokenize(doc.lower(), language='english')   # word tokens
        sent_tokens = sent_tokenize(doc, language='english')      # sentence tokens
        words_wosp = RemoveStopwords(tokens, language='english')  # no stop words / punctuation
        words_woschs = RemoveSpecialCHs(tokens)                   # no punctuation tokens
        n_words = len(words_woschs)

        # LEXICAL FEATURES
        lexical = [
            Avg_wordLength(words_wosp),
            Avg_SentLenghtByCh(sent_tokens),
            Avg_SentLenghtByWord(sent_tokens),
            Avg_Syllable_per_Word(words_wosp),
            CountSpecialCharacter(doc),
            CountPunctuation(doc),
            CountFunctionalWords(words_woschs, n_words),
        ]

        # VOCABULARY RICHNESS FEATURES
        type_token = typeTokenRatio(tokens)
        honore_r, hapax = hapaxLegemena(words_woschs, n_words)
        sichel_s, dihapax = hapaxDisLegemena(words_woschs, n_words)
        richness = [
            type_token,
            hapax,
            honore_r,
            dihapax,
            sichel_s,
            YulesCharacteristicK(words_woschs, n_words),
            SimpsonsIndex(words_woschs, n_words),
            BrunetsMeasureW(words_woschs, n_words),
            ShannonEntropy(words_woschs, n_words),
        ]

        # READABILITY FEATURES
        readability = [
            FleschReadingEase(words_woschs, n_words, winSize),
            FleschCincadeGradeLevel(words_woschs, n_words, winSize),
            dale_chall_readability_formula(words_woschs, n_words, winSize),
            GunningFoxIndex(words_woschs, n_words, winSize),
        ]

        vector.append(lexical + richness + readability)

    return vector

# ------------------------------------------------------------------
def RemoveStopwords(tokens, language='english'):
    """Return `tokens` without NLTK stop words and punctuation tokens.

    Args:
        tokens: list of word tokens.
        language: NLTK stop-word language name.

    Returns:
        New list with the surviving tokens in their original order.
    """
    st = [",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
          "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n']
    # A set gives constant-time membership tests instead of scanning the
    # combined list once per token.
    excluded = set(stopwords.words(language))
    excluded.update(st)
    return [word for word in tokens if word not in excluded]
# ------------------------------------------------------------------------
def RemoveSpecialCHs(tokens):
    """Return `tokens` without the tokens that are a single punctuation or
    special character (including tab and newline); order is preserved."""
    special = (",", ".", "'", "!", '"', "#", "$", "%", "&", "(", ")", "*", "+", "-", ".", "/", ":", ";", "<", "=", '>', "?",
               "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n')
    kept = []
    for token in tokens:
        if token in special:
            continue
        kept.append(token)
    return kept
# ---------------------------------------------------------------------
def syllable_count_Manual(word):
  """Estimate the number of syllables of `word` with a vowel-group heuristic.

  Every maximal group of vowels ("aeiouy") counts as one syllable, a final
  "e" is treated as silent, and at least one syllable is always reported.

  Returns:
      The estimated count (>= 1), or -1 for an empty or non-string `word`.
  """
  if not isinstance(word, str) or not word:
    return -1
  word = word.lower()
  vowels = "aeiouy"
  count = 1 if word[0] in vowels else 0
  # A vowel that follows a non-vowel opens a new vowel group.
  for previous, current in zip(word, word[1:]):
    if current in vowels and previous not in vowels:
      count += 1
  # Discount a silent final "e" exactly once. This used to sit inside the loop,
  # where it cancelled every increment for words ending in "e".
  if word.endswith("e"):
    count -= 1
  return max(count, 1)
# ---------------------------------------------------------------------
# COUNTS NUMBER OF SYLLABLES
def syllable_count(word):
    """Return the number of syllables of `word`.

    The first pronunciation found in the module-level `cmuDictionary` is used:
    each phoneme ending in a digit counts as one syllable. When the word
    cannot be looked up (dictionary not loaded, unknown word, non-string
    input) the count falls back to syllable_count_Manual().
    """
    global cmuDictionary
    d = cmuDictionary
    try:
        first_pronunciation = d[word.lower()][0]
        syl = sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
    except Exception:
        # Lookup failed: use the heuristic (KeyboardInterrupt is not trapped).
        syl = syllable_count_Manual(word)
    return syl
    # ----------------------------------------------------------------------------
# removing stop words plus punctuation.
def Avg_wordLength(words_wosp):
  """Return the average length, in characters, of the words in `words_wosp`.

  Returns -1 when the input cannot be processed. An empty list is passed to
  np.average as is, so the result is then whatever NumPy returns for an
  empty input rather than -1.
  """
  try:
    return np.average([len(word) for word in words_wosp])
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# ----------------------------------------------------------------------------
# returns avg number of characters in a sentence
def Avg_SentLenghtByCh(sent_tokens):
  """Return the average number of characters per sentence in `sent_tokens`.

  Returns -1 when the input cannot be processed.
  """
  try:
    return np.average([len(sentence) for sentence in sent_tokens])
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# ----------------------------------------------------------------------------
# returns avg number of words in a sentence
def Avg_SentLenghtByWord(sent_tokens):
  """Return the average number of whitespace-separated words per sentence.

  Returns -1 when the input cannot be processed.
  """
  try:
    return np.average([len(sentence.split()) for sentence in sent_tokens])
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# ----------------------------------------------------------------------------
# GIVES NUMBER OF SYLLABLES PER WORD
def Avg_Syllable_per_Word(words_wosp):
  """Return the average syllable count per word in `words_wosp`.

  An empty list yields 0 (the divisor is clamped to 1); -1 is returned when
  the input cannot be processed.
  """
  try:
    total_syllables = sum(syllable_count(word) for word in words_wosp)
    return total_syllables / max(1, len(words_wosp))
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# -----------------------------------------------------------------------------
# COUNTS SPECIAL CHARACTERS NORMALIZED OVER LENGTH OF doc
def CountSpecialCharacter(text):
  """Return the share of characters of `text` that are special characters.

  Special characters are symbols such as # $ % & ( ) * + - / < = > @ [ \\ ] ^
  _ ` { | } ~ plus tab and newline. Returns -1 for empty or invalid input.
  """
  try:
    special = {"#", "$", "%", "&", "(", ")", "*", "+", "-", "/", "<", "=", '>',
               "@", "[", "\\", "]", "^", "_", '`', "{", "|", "}", '~', '\t', '\n'}
    hits = sum(1 for ch in text if ch in special)
    return hits / len(text)
  except Exception:
    # Covers the empty string (division by zero) and non-string input.
    return -1

# ----------------------------------------------------------------------------
def CountPunctuation(text):
  """Return the share of characters of `text` that are punctuation marks
  (, . ' ! " ; ? :). Returns -1 for empty or invalid input.
  """
  try:
    marks = {",", ".", "'", "!", '"', ";", "?", ":"}
    hits = sum(1 for ch in text if ch in marks)
    return float(hits) / float(len(text))
  except Exception:
    # Covers the empty string (division by zero) and non-string input.
    return -1

# ----------------------------------------------------------------------------
# RETURNS NORMALIZED COUNT OF FUNCTIONAL WORDS FROM A Framework for
# Authorship Identification of Online Messages: Writing-Style Features and Classification Techniques
# Functional-word list used by CountFunctionalWords(); built once at import
# time and stored as a frozenset for constant-time membership tests.
_FUNCTIONAL_WORDS = frozenset("""a between in nor some upon
    about both including nothing somebody us
    above but inside of someone used
    after by into off something via
    all can is on such we
    although cos it once than what
    am do its one that whatever
    among down latter onto the when
    an each less opposite their where
    and either like or them whether
    another enough little our these which
    any every lots outside they while
    anybody everybody many over this who
    anyone everyone me own those whoever
    anything everything more past though whom
    are few most per through whose
    around following much plenty till will
    as for must plus to with
    at from my regarding toward within
    be have near same towards without
    because he need several under worth
    before her neither she unless would
    behind him no should unlike yes
    below i nobody since until you
    beside if none so up your
    """.split())


def CountFunctionalWords(words_woschs, len_words_woschs):
  """Return the share of tokens that are functional words.

  Args:
      words_woschs: list of word tokens.
      len_words_woschs: divisor used for normalisation (the caller passes the
          length of `words_woschs`).

  Returns:
      count / len_words_woschs, or -1 for a zero divisor or invalid input.
  """
  try:
    count = sum(1 for word in words_woschs if word in _FUNCTIONAL_WORDS)
    return count / len_words_woschs
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# ---------------------------------------------------------------------------
# also returns Honore Measure R
def hapaxLegemena(words_woschs, N):
  """Return Honore's R and the hapax-legomena ratio.

  With V distinct words, V1 of which occur exactly once:
      R = 100 * log(N) / (1 - V1 / V)
      h = V1 / N

  Args:
      words_woschs: list of word tokens.
      N: total number of words (the caller passes len(words_woschs)).

  Returns:
      (R, h), or (-1, -1) when the values cannot be computed (e.g. empty input).
  """
  try:
    freqs = coll.Counter(words_woschs)
    V1 = sum(1 for occurrences in freqs.values() if occurrences == 1)
    V = float(len(freqs))
    denominator = 1 - (V1 / V)
    # The previous guard `max(1, denominator)` always evaluated to 1 because
    # the denominator never exceeds 1, which reduced R to 100 * log(N).
    # Only the undefined case (every distinct word is a hapax, denominator 0)
    # now falls back to a divisor of 1.
    if denominator <= 0:
      denominator = 1
    R = 100 * math.log(N) / denominator
    h = V1 / N
    return R, h
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1, -1
# ---------------------------------------------------------------------------
def hapaxDisLegemena(words_woschs, len_words_woschs):
  """Return Sichel's S and the dis-legomena ratio.

  With V2 the number of distinct words that occur exactly twice:
      S = V2 / (number of distinct words)
      h = V2 / len_words_woschs

  Returns:
      (S, h), or (-1, -1) when the values cannot be computed (e.g. empty input).
  """
  try:
    freqs = coll.Counter(words_woschs)
    twice = sum(1 for occurrences in freqs.values() if occurrences == 2)
    h = twice / float(len_words_woschs)
    S = twice / float(len(freqs))
    return S, h
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1,-1
# ---------------------------------------------------------------------------
# c(w) = ceil(log2(f(w*) / f(w))), where f(w*) is the frequency of the most
# common word and f(w) the frequency of word w.
# A vocabulary-richness measure related to Zipf's law: with f(w*) held fixed,
# the class of a word reflects its frequency rank.
def AvgWordFrequencyClass(words_woschs):
  """Return the average word-frequency class over all tokens.

  As implemented, the class of a word is
  floor(log2((f_max + 1) / f(w) + 1)), i.e. a floored and offset variant of
  the formula in the comment above.

  Returns -1 when the value cannot be computed (e.g. empty input).
  """
  try:
    freqs = coll.Counter(words_woschs)
    maximum = float(max(freqs.values()))
    classes = [math.floor(math.log((maximum + 1) / (freqs[word]) + 1, 2)) for word in words_woschs]
    return np.average(classes)
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# --------------------------------------------------------------------------
# TYPE TOKEN RATIO NO OF DIFFERENT WORDS / NO OF WORDS
def typeTokenRatio(tokens):
  """Return (number of distinct tokens) / (number of tokens).

  Returns -1 for empty or invalid input.
  """
  try:
    return len(set(tokens)) / len(tokens)
  except Exception:
    # Covers the empty list (division by zero) and non-sequence input.
    return -1
# --------------------------------------------------------------------------
# logW = V-a/log(N)
# N = total words , V = vocabulary richness (unique words) ,  a=0.17
# we can convert into log because we are only comparing different texts
def BrunetsMeasureW(words_woschs, N):
  """Return the notebook's Brunet-style richness score (V - a) / log(N).

  V is the number of distinct words, N the total number of words and
  a = 0.17.
  NOTE(review): Brunet's W is usually given as N ** (V ** -a); the expression
  below is kept as found - confirm which form is intended.

  Returns -1 when the value cannot be computed (e.g. N <= 1).
  """
  try:
    a = 0.17
    V = float(len(set(words_woschs)))
    return (V - a) / (math.log(N))
  except Exception:
    # Covers log(0) and the zero divisor for N == 1.
    return -1
# -------------------------------------------------------------------------
# K  10,000 * (M - N) / N**2
# , where M  Sigma i**2 * Vi.
def YulesCharacteristicK(words_woschs, N):
  """Return Yule's characteristic K = 10,000 * (M - N) / N**2.

  M = sum over i of i**2 * V_i, where V_i is the number of distinct words that
  occur exactly i times. This equals the sum of squared word frequencies.

  Args:
      words_woschs: list of word tokens.
      N: total number of words (the caller passes len(words_woschs)).

  Returns:
      K, or -1 when it cannot be computed (e.g. N == 0).
  """
  try:
    freqs = coll.Counter(words_woschs)
    # Summing f**2 once per distinct word counts each frequency class exactly
    # V_i times. The previous code additionally multiplied by V_i, which
    # inflated M to sum(i**2 * V_i**2).
    M = sum(frequency * frequency for frequency in freqs.values())
    return 10000 * (M - N) / math.pow(N, 2)
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# -------------------------------------------------------------------------
# -1*sigma(pi*lnpi)
# Shannon and sympsons index are basically diversity indices for any community
def ShannonEntropy(words_woschs, lenght):
  """Return the Shannon entropy (base 2) of the word-frequency distribution.

  Args:
      words_woschs: list of word tokens.
      lenght: total number of words used to scale the counts (clamped to a
          minimum of 1).

  Returns:
      The entropy in bits, or -1 when it cannot be computed.
  """
  # Import the function explicitly: `import scipy` alone does not guarantee
  # that `scipy.stats` is loaded. Kept outside the try block so a missing
  # dependency is reported instead of being turned into -1.
  from scipy.stats import entropy
  try:
    counts = np.array(list(coll.Counter(words_woschs).values()), dtype=float)
    distribution = counts / max(1, lenght)
    return entropy(distribution, base=2)
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# ------------------------------------------------------------------
# 1 - (sigma(n(n - 1))/N(N-1)
# N is total number of words
# n is the number of each type of word
def SimpsonsIndex(words_woschs, N):
  """Return Simpson's diversity index D = 1 - sum(n * (n - 1)) / (N * (N - 1)).

  n runs over the frequency of each distinct word; N is the total number of
  words. Returns -1 when D cannot be computed (e.g. N <= 1).
  """
  try:
    freqs = coll.Counter(words_woschs)
    pair_count = sum(1.0 * occurrences * (occurrences - 1) for occurrences in freqs.values())
    return 1 - (pair_count / (N * (N - 1)))
  except Exception:
    # Covers the zero divisor for N in (0, 1) and invalid input.
    return -1
# ------------------------------------------------------------------
def FleschReadingEase(words_woschs, l, NoOfsentences):
  """Return the Flesch reading-ease score.

  score = 206.835 - 1.015 * (l / NoOfsentences) - 84.6 * (syllables / l)

  Args:
      words_woschs: list of word tokens.
      l: number of words.
      NoOfsentences: number of sentences.

  Returns:
      The score, or -1 when it cannot be computed (e.g. l == 0).
  """
  try:
    scount = sum(syllable_count(word) for word in words_woschs)
    return 206.835 - 1.015 * (l / float(NoOfsentences)) - 84.6 * (scount / float(l))
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# -------------------------------------------------------------------
def FleschCincadeGradeLevel(words_woschs, l, NoOfSentences):
  """Return the Flesch-Kincaid grade level.

  grade = 0.39 * (l / NoOfSentences) + 11.8 * (syllables / l) - 15.59

  Args:
      words_woschs: list of word tokens.
      l: number of words.
      NoOfSentences: number of sentences.

  Returns:
      The grade level, or -1 when it cannot be computed (e.g. l == 0).
  """
  try:
    scount = sum(syllable_count(word) for word in words_woschs)
    return 0.39 * (l / NoOfSentences) + 11.8 * (scount / float(l)) - 15.59
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# -----------------------------------------------------------------
# Location of the pickled familiar-word collection and a cache of what was
# loaded from it, so the file is read once rather than once per document.
_DALE_CHALL_PATH = '/gdrive/My Drive/My Developments/Whale and Jaguar/dale-chall.pkl'
_DALE_CHALL_CACHE = {}


def _load_dale_chall_words(path=_DALE_CHALL_PATH):
  """Return the unpickled familiar-word collection stored at `path` (cached)."""
  if path not in _DALE_CHALL_CACHE:
    with open(path, 'rb') as f:
      # SECURITY: unpickling executes code embedded in the file; only load a
      # pickle that comes from a trusted source.
      _DALE_CHALL_CACHE[path] = pickle.load(f)
  return _DALE_CHALL_CACHE[path]


def dale_chall_readability_formula(words_woschs, NoOfWords, NoOfSectences):
  """Return the Dale-Chall readability score.

  score = 0.1579 * pct_difficult + 0.0496 * (NoOfWords / NoOfSectences),
  plus 3.6365 when more than 5% of the words are difficult. A word is
  difficult when it is not in the familiar-word collection.

  Returns:
      The score, or -1 when it cannot be computed (word file unavailable,
      zero words or sentences, invalid input).
  """
  try:
    fimiliarWords = _load_dale_chall_words()
    difficult = sum(1 for word in words_woschs if word not in fimiliarWords)
    percent = (difficult / NoOfWords) * 100
    adjusted = 3.6365 if percent > 5 else 0
    return 0.1579 * (percent) + 0.0496 * (NoOfWords / NoOfSectences) + adjusted
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1
# ------------------------------------------------------------------
def GunningFoxIndex(words_woschs, len_words_woschs, NoOfSentences):
  """Return the Gunning fog index.

  index = 0.4 * (words / sentences + 100 * complex_words / words), where a
  complex word has more than two syllables.

  Returns:
      The index, or -1 when it cannot be computed (e.g. zero words).
  """
  try:
    NoOFWords = float(len_words_woschs)
    complexWords = sum(1 for word in words_woschs if syllable_count(word) > 2)
    return 0.4 * ((NoOFWords / NoOfSentences) + 100 * (complexWords / NoOFWords))
  except Exception:
    # -1 is the failure sentinel; KeyboardInterrupt is no longer swallowed.
    return -1

In [0]:
# Read docs
# Earlier variant that read the JSON-lines dataset, kept for reference:
#df = pd.read_json (r'/gdrive/My Drive/My Developments/Whale and Jaguar/News_Category_Dataset_v2.json', lines=True)
# Load sheet 'Sheet1' of the workbook on the mounted Drive into `df`.
df = pd.read_excel('/gdrive/My Drive/My Developments/Whale and Jaguar/News_Category_Dataset_stylometry.xlsx', sheet_name='Sheet1')

In [0]:
## DATA TRANSFORMATION

# standardize separator from "authors" field
sep = ', '
auth = []
main_author = []

for line in df['authors']:
  # standard_sep returns None for cells it cannot process (e.g. missing
  # authors); treat those as an empty author string instead of crashing.
  a = (standard_sep(line,sep) or '').split(sep)
  auth.append(a)
  main_author.append(a[0])

df['authors_sep'] = auth #separate authors
df['main_author'] = main_author #identify main author
df['text'] = df['headline'] + ". " + df['short_description'] #join texts

# Language detection (disabled; every document is processed as English below):
# job_lang = []
# for doc in df['text']:
#   jl = detect_lang(doc, default_lang='english')
#   job_lang.append(jl)
# df['language'] = job_lang

texts_clean = []
texts_stem = []
texts_lemma = []  # was initialised as `text_lemma`, which left `texts_lemma` undefined
count = 0
for doc in df['text'].values.tolist():
  # Remove stopwords and other cleansing
  doc_clean = clean_text(doc, lang='english')
  texts_clean.append(doc_clean)
  #Stemma
  doc_stem = stem_text(doc_clean, lang='english')
  texts_stem.append(doc_stem)
  #Lemma
  doc_lemma = lemma_text(doc_clean, lang='english')
  texts_lemma.append(doc_lemma)  # store the lemmas (the stems were stored here before)
  count = count + 1
  if count % 1000 == 0:
    print("line "+str(count)+ " advance "+str(round(100*count/len(df),2))+"%")

df['text_clean'] = texts_clean
df['text_stem'] = texts_stem
df['text_lemma'] = texts_lemma

# merge some classes
df['category_merged']=df['category'].replace({"HEALTHY LIVING": "WELLNESS",
                                              "QUEER VOICES": "GROUPS VOICES",
                                              "BUSINESS": "BUSINESS & FINANCES",
                                              "PARENTS": "PARENTING",
                                              "BLACK VOICES": "GROUPS VOICES",
                                              "THE WORLDPOST": "WORLD NEWS",
                                              "STYLE": "STYLE & BEAUTY",
                                              "GREEN": "ENVIRONMENT",
                                              "TASTE": "FOOD & DRINK",
                                              "WORLDPOST": "WORLD NEWS",
                                              "SCIENCE": "SCIENCE & TECH",
                                              "TECH": "SCIENCE & TECH",
                                              "MONEY": "BUSINESS & FINANCES",
                                              "ARTS": "ARTS & CULTURE",
                                              "COLLEGE": "EDUCATION",
                                              "LATINO VOICES": "GROUPS VOICES",
                                              "CULTURE & ARTS": "ARTS & CULTURE",
                                              "FIFTY": "MISCELLANEOUS",
                                              "GOOD NEWS": "MISCELLANEOUS"})

In [0]:
## TEXT CLUSTERING

# best LDA
# Grid-search the number of LDA components on 20,000 cleaned documents drawn
# with replacement.
# NOTE(review): the sample is unseeded, so results differ between runs, and
# `text_clean` can hold None for rows clean_text() could not process - confirm
# the vectorizer inside best_LDA copes with that.
documents = df['text_clean'].sample(n=20000,replace=True).values.tolist()
search_params = {'n_components': [10, 15, 20, 25, 30]}
best_model_params, best_model_score, best_model_perplexity = best_LDA(search_params, documents)

In [0]:
## TEXT CLUSTERING

# Cast to str so that missing cleaned texts become strings the vectorizer accepts.
documents = df['text_clean'].astype(str)
n_topics = 15   # number of LDA topics
n_words = 10    # top words reported per topic

# LDA
# LDA_topic: dominant topic index per document; LDA_topic_words: table with
# the top words of each topic (exported in the last cell).
LDA_topic, LDA_topic_words = LDA_model(documents, n_topics, n_words)
df['LDA_topic'] = LDA_topic
# kMeans
# Alternative k-means topic assignment, currently disabled:
#kMeans_topic = kMeans_topics(documents, n_topics)
#df['kMeans_topic'] = kMeans_topic

In [0]:
style.use("ggplot")

# STYLOMETRIC FEATURE EXTRACTION

# Module-level slot for the CMU dictionary; FeatureExtraction() fills it.
cmuDictionary = None
texts = df['text'].tolist()
start_time = time.time()
# winSize is forwarded by FeatureExtraction to the readability formulas as
# their sentence-count argument; `step` is not used by it.
vector = FeatureExtraction(texts, winSize=10, step=10)
end_time = time.time()
print("elapsed time: "+str((end_time - start_time)/60)+" min")

# Despite its name, `feature_names` is a DataFrame holding the feature values;
# the column order below must match the order in which FeatureExtraction
# appends the features.
feature_names = pd.DataFrame(vector,
                             columns =['Avg_wordLength', 'Avg_SentLenghtByCh', 'Avg_SentLenghtByWord', 'Avg_Syllable_per_Word', 'CountSpecialCharacter', 
                                       'CountPunctuation', 'CountFunctionalWords', 'typeTokenRatio', 'hapax', 'HonoreMeasureR', 
                                       'dihapax', 'SichelesMeasureS', 'YulesCharacteristicK','SimpsonsIndex', 'BrunetsMeasureW', 
                                       'ShannonEntropy', 'FleschReadingEase', 'FleschCincadeGradeLevel', 'dale_chall_readability_formula', 'GunningFoxIndex'])

# Append the feature columns to df; the index is reset so rows align by position.
df = pd.concat([df.reset_index(drop=True), feature_names], axis=1)

In [0]:
# STYLOMETRIC CLUSTERING

# Take every column from position 13 onwards as the numeric feature matrix and
# replace missing values by 0. The chained form works on a copy, so df is not
# modified in place (the previous `fillna(..., inplace=True)` ran on a slice
# of df).
# NOTE(review): the positional slice also picks up any column added after the
# stylometric features (including 'stylometric_km_group' when this cell is
# re-run) - confirm that position 13 is the first feature column.
vector = df.iloc[:, 13:].fillna(0).values.tolist()
#kMeans_ElbowMethod(np.array(vector), kmin=1,kmax=35)

labels = kMeans_clusters(vector, K=15)
df["stylometric_km_group"] = labels

In [0]:
# CATEGORY CLASSIFICATION: data preparation

## Prepare dataset
selected = ['text_clean','text_stem','category_merged']
# Create training dataframe
df2 = df[selected].sample(n=len(df)) #re-sort rows (shuffle)
c_size = df2['category_merged'].value_counts().min() #finds smallest category
TestSize = 0.15 #share of the smallest category that is held out of training
columns = df2.columns #gets columns names
categories = df2['category_merged'].unique() # obtains unique values for categories

# Balanced training set: the same number of rows from every category. The
# pieces are concatenated once; DataFrame.append is no longer available in
# pandas 2.x and re-copied the frame on every iteration.
n_train_per_category = int((1-TestSize)*c_size)
train_parts = [df2[df2.category_merged == cat][:n_train_per_category] for cat in categories]
df_train = pd.concat(train_parts) if train_parts else pd.DataFrame(columns=columns)

#create test dataframe: rows not used for training, sampled to the training size
df_test = df.drop(df_train.index)
df_test = df_test.sample(n = len(df_train))

#fill NaN
df_train = df_train.fillna("0000")
df_test = df_test.fillna("0000")

# Separate x from Y
x_train = df_train['text_clean'].tolist()
Y_train = df_train['category_merged'].tolist()
x_test = df_test['text_clean'].tolist()
Y_real = df_test['category_merged'].tolist()

In [0]:
# GRID OPTIMIZATION

# Set pipeline:
# 1. vectorize
# 2. transform to TF-IDF
# 3. Assign classifiers
# 4. ensemble by voting

# Soft voting averages predict_proba() of every estimator, so the SVC must be
# built with probability=True; without it prediction (and therefore grid-search
# scoring) fails.
classifiers = [('svm', SVC(C=1, kernel='linear', probability=True)), ('rf', RandomForestClassifier(n_estimators=100)), ('nb', MultinomialNB()), ('lgb',LGBMClassifier(n_estimators=100))]
# Each weight vector switches one of the four estimators off.
param_grid = {'voting__weights': [[1, 1, 1, 0], [1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 1, 1]]}
clf = RandomForestClassifier(n_estimators=100)  # not used by the pipeline below
pl = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('voting', VotingClassifier(classifiers, voting="soft"))])

# Train grid model
grid_search = GridSearchCV(pl, param_grid=param_grid, n_jobs=-1, verbose=10, scoring='accuracy')
grid_search.fit(x_train, Y_train)
cv_results = grid_search.cv_results_
best_parameters = grid_search.best_estimator_.get_params()


In [0]:
# CATEGORY CLASSIFICATION

# 1. vectorize
# 2. transform to TF-IDF
# 3. Assign classifiers
# 4. ensemble by voting

# Soft voting averages predict_proba() of every estimator, so the SVC must be
# built with probability=True; without it model.predict() fails.
classifiers = [('svm', SVC(C=1, kernel='linear', probability=True)),
               ('rf', RandomForestClassifier(n_estimators=100)),
               ('nb', MultinomialNB())]

clf = RandomForestClassifier(n_estimators=100)  # not used by the pipeline below
pl = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
               ('voting', VotingClassifier(estimators=classifiers, voting='soft',weights=[1,1,1]))])
# Train model
model = pl.fit(x_train, Y_train)
# saves model to file; the context manager closes the handle even on failure
with open("/gdrive/My Drive/My Developments/Whale and Jaguar/model_ensemble.p", "wb") as model_file:
    pickle.dump(model, model_file)

In [0]:
# CATEGORY CLASSIFICATION

# Predict a category for every row so the result can be stored next to the data.
Y = df['category_merged'].tolist()
x = df['text_clean'].fillna("notexthere").tolist()
Prediction = model.predict(x)
df["category_predicted"] = Prediction

# Report quality only on rows that were not used for training; scoring the
# full frame mixed the training rows into the metrics.
held_out = ~df.index.isin(df_train.index)
Y_held_out = df.loc[held_out, 'category_merged'].tolist()
Prediction_held_out = Prediction[held_out]

CM = sklearn.metrics.confusion_matrix(Y_held_out, Prediction_held_out)
accuracy = sklearn.metrics.accuracy_score(Y_held_out, Prediction_held_out)
print("accuracy: "+str(accuracy))
print(sklearn.metrics.classification_report(Y_held_out, Prediction_held_out))

In [0]:
# Export of the fully processed frame, currently disabled:
#df.to_excel(r'/gdrive/My Drive/My Developments/Whale and Jaguar/News_Category_Dataset_processed.xlsx', index=False)
# Write the per-topic top-word table produced by LDA_model() to the Drive.
LDA_topic_words.to_excel(r'/gdrive/My Drive/My Developments/Whale and Jaguar/News_Category_Dataset_LDA_topic_table.xlsx', index=False)

In [0]:
from sklearn.model_selection import GridSearchCV