In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict
import gc
%matplotlib inline

In [2]:
# Positional layout of the tab-separated fields on each line of articles.txt.
columns = ["pubId", "is_hourly", "seqId", "on_homepage", "canonicalUrl",
                   "firstScrape", "lang_iso", "lang_reliability", "title", "text"]

def read_article_df(file):
    """Parse a tab-separated article dump into a DataFrame.

    Parameters
    ----------
    file : text file-like object
        Open handle whose ``read()`` returns the whole dump; one article per
        line, fields separated by tabs in the order given by ``columns``.

    Returns
    -------
    pandas.DataFrame
        One row per line with the string columns ``pubId``, ``canonicalUrl``,
        ``firstScrape``, ``title``, ``text``, ``lang_reliability`` and
        ``lang_iso`` (in that order).

    Raises
    ------
    IndexError
        If a line has fewer tab-separated fields than ``columns`` describes.
    """
    lines = file.read().split('\n')
    # A trailing newline leaves one empty string at the end of the split.
    # Drop only that artefact: slicing off the last element unconditionally
    # would silently lose the final article of a file without a trailing newline.
    if lines and lines[-1] == '':
        lines.pop()

    kept = ['pubId', 'canonicalUrl', 'firstScrape', 'title', 'text',
            'lang_reliability', 'lang_iso']
    # Resolve field positions from the declared layout rather than magic numbers.
    positions = [columns.index(name) for name in kept]

    records = []
    for line in lines:
        row = line.split('\t')
        records.append([row[pos] for pos in positions])

    return pd.DataFrame(records, columns=kept)

In [3]:
# Load the raw dump inside a context manager so the file handle is closed as
# soon as parsing finishes (the bare open() call was never closed).
with open('../data/raw/2018_07_19_04_59_08/articles.txt', encoding='utf-8') as articles_file:
    article_df = read_article_df(articles_file)

# Keep only rows whose lang_reliability field is the string '1'.
article_df = article_df[article_df.lang_reliability == '1'].reset_index(drop=True)

# Trim surrounding whitespace, then record character lengths for later filtering.
article_df['title'] = article_df.title.str.strip()
article_df['text'] = article_df.text.str.strip()
article_df['title_len'] = article_df.title.str.len()
article_df['text_len'] = article_df.text.str.len()

In [4]:
# Keep English articles that have a non-empty title and more than 100
# characters of body text; the three conditions are combined into one mask.
keep_mask = (
    (article_df.title_len > 0)
    & (article_df.text_len > 100)
    & (article_df.lang_iso == 'en')
)
clean_df = article_df[keep_mask]
clean_df.shape

(176664, 9)

In [5]:
def _read_label_file(path, label):
    """Read a headerless one-column file of URLs and tag every row with `label`."""
    urls = pd.read_csv(path, header=None)
    urls.columns = ['canonicalUrl']
    return urls.assign(label=label)


# One file of URLs per event; each becomes a (canonicalUrl, label) frame.
label1 = _read_label_file('../data/raw/labels/cave_rescue/lower_bound.txt', 'cave_rescue')
label2 = _read_label_file('../data/raw/labels/duckboat/lower_bound.txt', 'duckboat')
label3 = _read_label_file('../data/raw/labels/helsinki_summit/lower_bound.txt', 'helsinki')

# Stack the three frames; each keeps its own 0-based index, as before.
label_df = pd.concat([label1, label2, label3])
label_df

Unnamed: 0,canonicalUrl,label
0,bbc.com/news/av/world-asia-44875089/thai-cave-...,cave_rescue
1,indystar.com/story/news/nation-now/2018/07/16/...,cave_rescue
2,washingtonpost.com/world/asia_pacific/these-di...,cave_rescue
3,au.news.yahoo.com/navy-seal-died-thai-cave-res...,cave_rescue
4,yahoo.com/news/m/8adca8cd-6cc3-307c-b109-9cd1d...,cave_rescue
...,...,...
487,chicago.suntimes.com/news/military-veterans-di...,helsinki
488,dailycaller.com/2018/07/19/mike-huckabee-media...,helsinki
489,sfgate.com/news/politics/article/trump-embrace...,helsinki
490,yahoo.com/news/m/78f6000e-d04c-355b-867d-8d5c8...,helsinki


In [6]:
# Attach the event label to each article by URL; articles whose URL is not in
# label_df get NaN. Any 'label' column left over from a previous run of this
# cell is dropped first, so re-running does not produce label_x / label_y
# columns and break the `clean_df.label` lookups that follow.
clean_df = (
    clean_df
    .drop(columns=['label'], errors='ignore')
    .merge(label_df, on='canonicalUrl', how='left')
)

In [7]:
# Split the merged frame by whether the left join found a label for the URL.
has_label = clean_df.label.notna()
article_with_label = clean_df[has_label]
article_without_label = clean_df[~has_label]
article_with_label.shape, article_without_label.shape


((140, 10), (176524, 10))

In [8]:
# All labeled articles plus a random subset of up to 10000 unlabeled ones.
# random_state pins the draw so Restart & Run All reproduces the same sample;
# the min() guard avoids the ValueError that sample() raises when asked for
# more rows than exist.
unlabeled_sample = article_without_label.sample(
    n=min(10000, len(article_without_label)),
    random_state=42,
)
sample_df = pd.concat([article_with_label, unlabeled_sample])


In [9]:
# Display the combined sample: labeled rows first, then the unlabeled sample.
sample_df

Unnamed: 0,pubId,canonicalUrl,firstScrape,title,text,lang_reliability,lang_iso,title_len,text_len,label
62,290,zerohedge.com/news/2018-07-19/fbi-chief-threat...,7/19/2018 8:26:52 AM -04:00,FBI Chief Threatens To Quit If Trump Invites R...,"by Knave Dave - Jul 18, 2018 1:11 pm ### This ...",1,en,78,2858,helsinki
10696,33,washingtonpost.com/news/morning-mix/wp/2018/07...,7/19/2018 11:51:57 PM -04:00,At least 8 reported dead as duck boat sinks ne...,At least 8 reported dead as duck boat sinks ne...,1,en,90,856,duckboat
10801,33,washingtonpost.com/news/posteverything/wp/2018...,7/19/2018 6:27:03 AM -04:00,"Ukraine’s not a country, Putin told Bush. What...",PostEverything Perspective ### Perspective Int...,1,en,79,8487,helsinki
13312,237,hotair.com/archives/2018/07/19/looking-glass-d...,7/19/2018 1:35:59 PM -04:00,"Through the looking glass: Democrats attack ""R...",Through the looking glass: Democrats attack “R...,1,en,86,4440,helsinki
16299,118,philly.com/philly/news/nation_world/20180719_a...,7/19/2018 11:15:42 PM -04:00,Sheriff: 8 people dead after Missouri tourist ...,Sheriff: 8 people dead after Missouri tourist ...,1,en,68,766,duckboat
...,...,...,...,...,...,...,...,...,...,...
119207,100,nordic.businessinsider.com/roger-federer-lost-...,7/19/2018 11:45:25 AM -04:00,Roger Federer lost his iconic 'RF' logo when h...,Lifestyle ### * Copyright © 2018 Business Insi...,1,en,107,1720,
23450,20,reddit.com/user/diglos76,7/19/2018 7:05:52 AM -04:00,diglos76 (u/diglos76) - Reddit,Press J to jump to the feed. Press question ma...,1,en,30,6605,
44755,1433,record-eagle.com/news/go/mike-terrell-manistee...,7/19/2018 8:46:10 PM -04:00,Mike Terrell: Manistee River offers small stre...,A few passing clouds. Low 68F. Winds light and...,1,en,80,4808,
172391,974,wgntv.com/2018/07/18/t-storms-coming-back-onto...,7/19/2018 2:43:18 PM -04:00,T-storms coming back onto Chicago’s weather sc...,T-storms coming back onto Chicago’s weather sc...,1,en,170,248,


In [10]:
import nltk
import ssl

# NOTE(review): this replaces the default HTTPS context with an unverified
# one, which turns off TLS certificate checking for every HTTPS request made
# through ssl's default context in this kernel, not only the NLTK downloads.
# Prefer fixing the local certificate store if possible.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl has no _create_unverified_context here; leave the default in place.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
# WordNetLemmatizer (used by the text-cleaning cell) looks up the 'wordnet'
# corpus; without it lemmatize() raises LookupError on a fresh machine.
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tniyomkarn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tniyomkarn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
import re
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer()

## text normalization
STOPWORDS = set(stopwords.words('english'))

def clean_text_lemma(text):
    """
        Normalize a piece of text for topic modelling.

        text: a paragraph (str)

        return: the lower-cased text with punctuation and English stopwords
                removed and every remaining token lemmatized, joined by
                single spaces
    """
    # Drop every character that is neither a word character (letters, digits,
    # underscore) nor whitespace.
    stripped = re.sub(r"[^\w\s]", '', text)
    lowered = stripped.lower()
    # Filter stopwords on whitespace-split words before tokenizing.
    kept_words = [word for word in lowered.split() if word not in STOPWORDS]
    tokens = nltk.word_tokenize(' '.join(kept_words))
    return ' '.join(map(lemmatizer.lemmatize, tokens))

In [12]:
# Only the 'text' column is needed, so apply the cleaner to that Series
# directly instead of building a row object per article with axis=1.
sample_df['clean_text'] = sample_df['text'].apply(clean_text_lemma)

In [13]:
# import codecs
# from datetime import datetime
# import vocabulary
# import time
# from numpy.random import choice
# import numpy
# import functools
# import sys
# import nltk

# sys.setrecursionlimit(10000)


# class HDP_gibbs_sampling:
#     def __init__(self, K0=10, alpha=0.5, beta=0.5,gamma=1.5, docs= None, V= None):
#         self.maxnn = 1
#         self.alss=[] # an array for keep the stirling(N,1:N) for saving time consumming
#         self.K = K0  # initial number of topics
#         self.alpha = alpha # parameter of topics prior
#         self.beta = beta   # parameter of words prior
#         self.gamma = gamma # parameter of tables prior
#         self.docs = docs # a list of documents which include the words
#         self.V = V # number of different words in the vocabulary
#         self.z_m_n = {} # topic assignements for documents
#         self.n_m_z = numpy.zeros((len(self.docs), self.K))      # number of words assigned to topic z in document m
#         self.theta = numpy.zeros((len(self.docs), self.K))
#         self.n_z_t = numpy.zeros((self.K, V)) # number of times a word v is assigned to a topic z
#         self.phi = numpy.zeros((self.K, V))
#         self.n_z = numpy.zeros(self.K)   # total number of words assigned to a topic z
#         self.U1=[] # active topics
#         for i in range (self.K):
#             self.U1.append(i)
        
#         self.U0=[] # deactive topics
#         self.tau=numpy.zeros(self.K+1) +1./self.K
#         for m, doc in enumerate(docs):         # Initialization of the data structures
#             for n,t in enumerate(doc):
#                 z = numpy.random.randint(0, self.K) # Randomly assign a topic to a word and increase the counting array
#                 self.n_m_z[m, z] += 1
#                 self.n_z_t[z, t] += 1
#                 self.n_z[z] += 1
#                 self.z_m_n[(m,n)]=z
    

#     def inference(self,iteration):
#         " Inference of HDP  using Dircet Assignment with ILDA simpilifying "
        
#         for m, doc in enumerate(self.docs):
#                 for n, t in enumerate(doc):
#                     # decrease the counting for word t with topic kold
#                     kold =self.z_m_n[(m,n)]
#                     self.n_m_z[m,kold] -= 1
#                     self.n_z_t[kold, t] -= 1
#                     self.n_z[kold] -= 1
#                     p_z=numpy.zeros(self.K+1)
#                     for kk in range (self.K): # using the z sampling equation in ILDA
#                         k=self.U1[kk]
#                         p_z[kk]=(self.n_m_z[m,k]+self.alpha*self.tau[k])*(self.n_z_t[k,t]+self.beta)/(self.n_z[k]+self.V*self.beta)
#                     p_z[self.K]=(self.alpha*self.tau[self.K])/self.V # additional cordinate for new topic
#                     knew = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()
#                     if knew==self.K: # check if topic sample is new
#                         self.z_m_n[(m,n)] = self.spawntopic(m,t) # extend the number of topics and arrays shape and assign the array for new topic
#                         self.updatetau() # update the table distribution over topic


#                     else :
#                         k=self.U1[knew] # do same as LDA
#                         self.z_m_n[(m,n)] = k
#                         self.n_m_z[m,k] += 1
#                         self.n_z_t[k, t] += 1
#                         self.n_z[k] += 1
                    
                    
#                     if self.n_z[kold]==0: # check if the topic have been not used and re shape the arrayes
#                         self.U1.remove(kold)
#                         self.U0.append(kold)
#                         self.K -=1
#                         self.updatetau()

                

#         print ('Iteration:',iteration,'\n','Number of topics:',self.K,'\n','Activated topics:',self.U1,'\n','Deactivated topics',self.U0)


#     def spawntopic (self,m,t): # reshape the arrays for new topic
#         if len(self.U0)>0: # if the we have deactive topics.
#             k=self.U0[0]
#             self.U0.remove(k)
#             self.U1.append(k)
#             self.n_m_z[m,k]=1
#             self.n_z_t[k,t]=1
#             self.n_z[k]=1
            
            
#         else:
#             k=self.K #  if the we do not have deactive topics so far.
#             self.n_m_z=numpy.append(self.n_m_z,numpy.zeros([len(self.docs),1]),1)
#             self.U1.append(k)
#             self.n_m_z[m,k] = 1
#             self.n_z_t=numpy.vstack([self.n_z_t,numpy.zeros(self.V)])
#             self.n_z_t[k, t] = 1
#             self.n_z=numpy.append(self.n_z,1)
#             self.tau=numpy.append(self.tau,0)
        
#         self.K +=1
        
#         return k
    
            
#     def stirling(self,nn): # making an array for keep the stirling(N,1:N) for saving time consumming
#         if len(self.alss)==0:
#             self.alss.append([])
#             self.alss[0].append(1)
#         if nn > self.maxnn:
#             for mm in range (self.maxnn,nn):
#                 ln=len(self.alss[mm-1])+1
#                 self.alss.append([])
                
#                 for xx in range(ln) :
#                     self.alss[mm].append(0)
#                     if xx< (ln-1):
#                         self.alss[mm][xx] += self.alss[mm-1][xx]*mm
#                     if xx>(ln-2) :
#                         self.alss[mm][xx] += 0
#                     if xx==0 :
#                         self.alss[mm][xx] += 0
#                     if xx!=0 :
#                         self.alss[mm][xx] += self.alss[mm-1][xx-1]

#             self.maxnn=nn
#         return self.alss[nn-1]
    
    
    
#     def rand_antoniak(self,alpha, n):
#         # Sample from Antoniak Distribution
#         ss = self.stirling(n)
#         max_val = max(ss)
#         p = numpy.array(ss) / max_val
        
#         aa = 1
#         for i, _ in enumerate(p):
#             p[i] *= aa
#             aa *= alpha
        
#         p = numpy.array(p,dtype='float') / numpy.array(p,dtype='float').sum()
#         return choice(range(1, n+1), p=p)
    
    
    
    
    
    
    
#     def updatetau(self):  # update tau using antoniak sampling from CRM
    
#         m_k=numpy.zeros(self.K+1)
#         for kk in range(self.K):
#             k=self.U1[kk]
#             for m in range(len(self.docs)):
                
#                 if self.n_m_z[m,k]>1 :
#                     m_k[kk]+=self.rand_antoniak(self.alpha*self.tau[k], int(self.n_m_z[m,k]))
#                 else :
#                     m_k[kk]+=self.n_m_z[m,k]
    
#         T=sum(m_k)
#         m_k[self.K]=self.gamma
#         tt=numpy.transpose(numpy.random.dirichlet(m_k, 1))
#         for kk in range(self.K):
#             k=self.U1[kk]
#             self.tau[k]=tt[kk]

#         self.tau[self.K]=tt[self.K]



#     def worddist(self):
#         """topic-word distribution, \phi in Blei'spaper  """
#         return (self.n_z_t +self.beta)/ (self.n_z[:, numpy.newaxis]+self.V*self.beta),len(self.n_z)


# if __name__ == "__main__":
#     corpus = sample_df['text']
#     iterations = 50 # number of iterations for getting converged
#     voca = vocabulary.Vocabulary(excluds_stopwords=False) # find the unique words in the dataset
#     docs = [voca.doc_to_ids(doc) for doc in corpus] # change words of the corpus to ids
#     HDP = HDP_gibbs_sampling(K0=20, alpha=0.5, beta=0.5, gamma=2, docs=docs, V=voca.size()) # initialize the HDP
#     for i in range(iterations):
#         HDP.inference(i)
#     (d,len) = HDP.worddist() # find word distribution of each topic
#     for i in range(len):
#         ind = numpy.argpartition(d[i], -10)[-10:] # top 10 most occured words for each topic
#         for j in ind:
#             print (voca[j],' ',end=""),
#         print ()

In [14]:
import codecs
from datetime import datetime
import vocabulary
import time
from numpy.random import choice
import numpy
import functools
import sys
import nltk

sys.setrecursionlimit(10000)


class HDP_gibbs_sampling:
    """Gibbs sampler for an HDP topic model whose number of topics can change.

    Topics are identified by integer ids that index the rows/columns of the
    count arrays. ``U1`` lists the ids currently in use (active), ``U0`` the
    ids that were emptied and can be recycled, and ``K == len(U1)``.

    ``tau`` holds one weight per allocated topic id (indexed by id) plus one
    trailing slot, ``tau[-1]``, for the weight of a not-yet-created topic.
    The array always has ``len(n_z) + 1`` entries, so the trailing slot can
    never coincide with a topic id.

    All randomness comes from the global ``numpy.random`` state; seed it
    before constructing the sampler for reproducible runs.
    """

    def __init__(self, K0=10, alpha=0.5, beta=0.5,gamma=1.5, docs= None, V= None):
        """Randomly assign every word to one of ``K0`` topics and build the counts.

        K0: initial number of topics.
        alpha: parameter of the topics prior.
        beta: parameter of the words prior.
        gamma: parameter of the tables prior (pseudo-count of the new-topic slot).
        docs: list of documents, each a sequence of integer word ids in
              ``range(V)``. Required: the ``None`` default fails on ``len``.
        V: number of different words in the vocabulary.
        """
        self.maxnn = 1    # largest n whose Stirling row is cached in ``alss``
        self.alss = []    # cached Stirling-number rows, filled lazily by ``stirling``
        self.K = K0       # current number of active topics
        self.alpha = alpha
        self.beta = beta
        self.gamma = gamma
        self.docs = docs
        self.V = V
        self.z_m_n = {}   # (document index, word position) -> topic id
        self.n_m_z = numpy.zeros((len(self.docs), self.K))  # words of document m assigned to topic z
        self.theta = numpy.zeros((len(self.docs), self.K))  # allocated here; not used by this class
        self.n_z_t = numpy.zeros((self.K, V))               # times word t is assigned to topic z
        self.phi = numpy.zeros((self.K, V))                 # allocated here; not used by this class
        self.n_z = numpy.zeros(self.K)                      # total words assigned to topic z
        self.U1 = list(range(self.K))  # active topic ids
        self.U0 = []                   # deactivated topic ids
        self.tau = numpy.zeros(self.K + 1) + 1. / self.K
        for m, doc in enumerate(docs):
            for n, t in enumerate(doc):
                # Random initial topic for each word; update every count array.
                z = numpy.random.randint(0, self.K)
                self.n_m_z[m, z] += 1
                self.n_z_t[z, t] += 1
                self.n_z[z] += 1
                self.z_m_n[(m, n)] = z

    def inference(self,iteration):
        """Run one Gibbs sweep over every word of every document.

        For each word the current assignment is removed from the counts, a
        topic is sampled among the active topics plus one "new topic" option,
        and the counts are restored for the sampled topic. A topic left with
        no words is moved from ``U1`` to ``U0``.

        iteration: value echoed in the progress line printed after the sweep.
        """
        for m, doc in enumerate(self.docs):
            for n, t in enumerate(doc):
                # Take the word out of its current topic.
                kold = self.z_m_n[(m, n)]
                self.n_m_z[m, kold] -= 1
                self.n_z_t[kold, t] -= 1
                self.n_z[kold] -= 1

                # Unnormalised probabilities: one entry per active topic, in
                # U1 order, followed by the new-topic option.
                p_z = numpy.zeros(self.K + 1)
                for kk in range(self.K):
                    k = self.U1[kk]
                    p_z[kk] = (self.n_m_z[m,k]+self.alpha*self.tau[k])*(self.n_z_t[k,t]+self.beta)/(self.n_z[k]+self.V*self.beta)
                # The new-topic weight lives in the trailing slot of tau.
                # tau[self.K] would alias an active topic id whenever some
                # other topic has been deactivated.
                p_z[self.K] = (self.alpha * self.tau[-1]) / self.V
                knew = numpy.random.multinomial(1, p_z / p_z.sum()).argmax()

                if knew == self.K:
                    # A new topic was drawn: create or recycle one, then
                    # refresh tau for the changed topic set.
                    self.z_m_n[(m, n)] = self.spawntopic(m, t)
                    self.updatetau()
                else:
                    k = self.U1[knew]
                    self.z_m_n[(m, n)] = k
                    self.n_m_z[m, k] += 1
                    self.n_z_t[k, t] += 1
                    self.n_z[k] += 1

                if self.n_z[kold] == 0:
                    # The old topic has no words left: deactivate it.
                    self.U1.remove(kold)
                    self.U0.append(kold)
                    self.K -= 1
                    self.updatetau()

        print ('Iteration:',iteration,'\n','Number of topics:',self.K,'\n','Activated topics:',self.U1,'\n','Deactivated topics',self.U0)

    def spawntopic (self,m,t):
        """Activate a topic holding only word ``t`` of document ``m``.

        A deactivated id is recycled when one exists; otherwise every count
        array and ``tau`` grow by one slot. Returns the id of the topic.
        """
        if len(self.U0) > 0:
            # Recycle the oldest deactivated id; its counts are all zero.
            k = self.U0[0]
            self.U0.remove(k)
            self.U1.append(k)
            self.n_m_z[m, k] = 1
            self.n_z_t[k, t] = 1
            self.n_z[k] = 1
        else:
            # No spare id: every allocated id is active, so the next id is K.
            k = self.K
            self.n_m_z = numpy.append(self.n_m_z, numpy.zeros([len(self.docs), 1]), 1)
            self.U1.append(k)
            self.n_m_z[m, k] = 1
            self.n_z_t = numpy.vstack([self.n_z_t, numpy.zeros(self.V)])
            self.n_z_t[k, t] = 1
            self.n_z = numpy.append(self.n_z, 1)
            # The former trailing slot becomes topic k's weight; the appended
            # zero is the new trailing (new-topic) slot.
            self.tau = numpy.append(self.tau, 0)

        self.K += 1

        return k

    def stirling(self,nn):
        """Return the cached row of unsigned Stirling numbers of the first kind for ``nn``.

        Row ``nn`` is ``[s(nn, 1), ..., s(nn, nn)]``, built from the previous
        row with ``s(n+1, k) = n * s(n, k) + s(n, k-1)``. Rows are kept in
        ``alss`` so each one is computed only once.
        """
        if len(self.alss) == 0:
            self.alss.append([1])
        if nn > self.maxnn:
            for mm in range(self.maxnn, nn):
                prev = self.alss[mm - 1]
                row = []
                for xx in range(len(prev) + 1):
                    value = 0
                    if xx < len(prev):
                        value += prev[xx] * mm
                    if xx != 0:
                        value += prev[xx - 1]
                    row.append(value)
                self.alss.append(row)
            self.maxnn = nn
        return self.alss[nn - 1]

    def rand_antoniak(self,alpha, n):
        """Draw an integer in ``1..n`` from the Antoniak distribution.

        The value ``i + 1`` is drawn with probability proportional to
        ``stirling(n)[i] * alpha ** i``.
        """
        ss = self.stirling(n)
        # Scale by the largest entry first so the weights stay in float range.
        max_val = max(ss)
        p = numpy.array(ss) / max_val

        aa = 1
        for i, _ in enumerate(p):
            p[i] *= aa
            aa *= alpha

        p = numpy.array(p, dtype='float')
        p = p / p.sum()
        return choice(range(1, n + 1), p=p)

    def updatetau(self):
        """Resample ``tau`` from a Dirichlet over per-topic table counts.

        For each active topic a table count is accumulated over documents
        (Antoniak draw when the document has more than one word in the topic,
        the raw count otherwise); ``gamma`` is the pseudo-count of the
        new-topic slot.
        """
        m_k = numpy.zeros(self.K + 1)
        for kk in range(self.K):
            k = self.U1[kk]
            for m in range(len(self.docs)):
                if self.n_m_z[m, k] > 1:
                    m_k[kk] += self.rand_antoniak(self.alpha * self.tau[k], int(self.n_m_z[m, k]))
                else:
                    m_k[kk] += self.n_m_z[m, k]
        m_k[self.K] = self.gamma

        # numpy.random.dirichlet rejects non-positive parameters. An active
        # topic can hold no words (never drawn at initialisation, or emptied
        # just before being deactivated). Such a topic gets weight 0 and the
        # draw is made over the remaining components only.
        positive = m_k > 0
        tt = numpy.zeros(self.K + 1)
        tt[positive] = numpy.random.dirichlet(m_k[positive])

        for kk in range(self.K):
            self.tau[self.U1[kk]] = tt[kk]
        # New-topic weight goes in the trailing slot so it cannot overwrite
        # the weight of an active topic whose id happens to equal K.
        self.tau[-1] = tt[self.K]

    def worddist(self):
        """Return ``(phi, n_ids)``: the topic-word distribution and its row count.

        ``phi`` has one row per allocated topic id (deactivated ids included;
        their rows are uniform because their counts are zero) and ``V``
        columns; ``n_ids`` is ``len(self.n_z)``.
        """
        return (self.n_z_t +self.beta)/ (self.n_z[:, numpy.newaxis]+self.V*self.beta),len(self.n_z)

In [15]:
# The sampler draws from the global numpy.random state; seed it so that
# Restart & Run All reproduces the same topics.
numpy.random.seed(42)

corpus = sample_df['clean_text']
iterations = 20 # number of Gibbs sweeps to run
voca = vocabulary.Vocabulary(excluds_stopwords=False) # find the unique words in the dataset
docs = [voca.doc_to_ids(doc) for doc in corpus] # change words of the corpus to ids
HDP = HDP_gibbs_sampling(K0=20, alpha=0.5, beta=0.5, gamma=2, docs=docs, V=voca.size()) # initialize the HDP
for i in range(iterations):
    HDP.inference(i)
(d,length) = HDP.worddist() # word distribution of every allocated topic id

# Report active topics only: rows of deactivated ids are uniform, so their
# "top words" are arbitrary. argsort orders the words by descending
# probability and also works when the vocabulary has fewer than 10 words.
for i in HDP.U1:
    ind = numpy.argsort(d[i])[::-1][:10] # 10 most probable words of topic i
    for j in ind:
        print (voca[j],' ',end="")
    print ()

Iteration: 0 
 Number of topics: 21 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] 
 Deactivated topics []
Iteration: 1 
 Number of topics: 20 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 Deactivated topics [20]
Iteration: 2 
 Number of topics: 20 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 Deactivated topics [20]
Iteration: 3 
 Number of topics: 20 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 Deactivated topics [20]
Iteration: 4 
 Number of topics: 20 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 Deactivated topics [20]
Iteration: 5 
 Number of topics: 20 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19] 
 Deactivated topics [20]
Iteration: 6 
 Number of topics: 20 
 Activated topics: [0, 1, 2, 3, 4, 5, 6, 7,

In [16]:
# Show the normalized text produced by clean_text_lemma.
sample_df['clean_text']

62        knave dave jul 18 2018 111 pm exactly first st...
10696     least 8 reported dead duck boat sink near bran...
10801     posteverything perspective perspective interpr...
13312     looking glass democrat attack russian bear cha...
16299     sheriff 8 people dead missouri tourist boat ac...
                                ...                        
119207    lifestyle copyright 2018 business insider inc ...
23450     press j jump feed press question mark learn re...
44755     passing cloud low 68f wind light variable toni...
172391    tstorms coming back onto chicago weather scene...
57828     thank registering sent confirmation email data...
Name: clean_text, Length: 10140, dtype: object

In [17]:
# Show the original article text, for comparison with the 'clean_text' column.
sample_df['text']

62        by Knave Dave - Jul 18, 2018 1:11 pm ### This ...
10696     At least 8 reported dead as duck boat sinks ne...
10801     PostEverything Perspective ### Perspective Int...
13312     Through the looking glass: Democrats attack “R...
16299     Sheriff: 8 people dead after Missouri tourist ...
                                ...                        
119207    Lifestyle ### * Copyright © 2018 Business Insi...
23450     Press J to jump to the feed. Press question ma...
44755     A few passing clouds. Low 68F. Winds light and...
172391    T-storms coming back onto Chicago’s weather sc...
57828     ### Thank you for registering! ### We have sen...
Name: text, Length: 10140, dtype: object