# Welome to Tech Day NLP Demonstration

In [1]:
import pandas as pd
import numpy as np
import time
import os
from os import path
from string import punctuation
import re
import matplotlib.pyplot as plt

import plotly

import nltk

nltk.download('stopwords')

from nltk.corpus import stopwords

pd.options.display.max_columns = None
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\593379\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_data():
    df = pd.read_csv('./data/file.csv')
    
    # Create a temporary list of sources starting from the first instance of sources 
    # and returning unique values in the 1st column (the column containing source names)
    # This also pulls in extra instances of 'Source' in the dataframe
    temp_sources = list(df.iloc[((df[df[df.columns[1]] == 'Source'].index[0])+1):,1].unique())
    
    # Create source list and use a for loop to fill the source list
    # using the temporary source list but excluding 'Source'
    source_list = []
    for i in temp_sources:
        if i != 'Source':
            source_list.append(i.lower())
    
    # Sanity Check
    # print(source_list)
    
    # Determine the index values in the data frame where 'Source' 
    # is found in column 1. This is used later to slice the dataframe into
    # sub dataframes
    source_breaks = list(df[df[df.columns[1]] == 'Source'].index)
    
    # sanity check
    # print(source_breaks)
    
    # print(len(source_list), len(source_breaks))
    
    # create some empty holder lists and dictionaries; 'd' is going to be a dictionary of dataframes
    # The other temp lists and dictionary are used in creating these sub dataframes and associated column names
    d = {}
    bad_cols = []
    good_cols = []
    temp_dict = {}
    
    # Create a for loop to iterate through source_list to construct sub dataframes
    for i, vals in enumerate(source_list):
        
        # Create a temp var 'j' for indexing/slicing the dataframe with iloc
        j = i+1
        
        # Create an if/else to test if the value of 'j' exceeds the length of the source_list
        # If it does not, create some sub dataframes using the value of 'j' as the 'stop' point 
        # for iloc slice.
        #
        # If it does, use the length of the main dataframe as the 'stop' point for the iloc slice
        if j < len(source_list):
            d['{0}'.format(vals)] = df.iloc[source_breaks[i]+1: source_breaks[j]]
            
            # Rename the columns by creating a dictionary from the bad column names and
            # the actual column names found at a specific location in the main dataframe
            bad_cols = list(d['{0}'.format(vals)].columns)
            good_cols = list(df.iloc[source_breaks[i]])
            temp_dict = dict(zip(bad_cols, good_cols))
            
            d['{0}'.format(vals)].rename(columns=temp_dict, inplace=True)
            d['{0}'.format(vals)].reset_index(inplace=True, drop=True)
            
        else:
            d['{0}'.format(vals)] = df.iloc[source_breaks[i]+1:]
            
            # Rename the columns by creating a dictionary from the bad column names and
            # the actual column names found at a specific location in the main dataframe
            bad_cols = list(d['{0}'.format(vals)].columns)
            good_cols = list(df.iloc[source_breaks[i]])
            temp_dict = dict(zip(bad_cols, good_cols))
            d['{0}'.format(vals)].rename(columns=temp_dict, inplace=True)
            d['{0}'.format(vals)].reset_index(inplace=True, drop=True)
    
    # This for loop formats date-time columns to appropriate datatype.
    # This is by far the most time-instensive process of this function
    # Time savings could be realized if ONLY dates are needed and not time & date.
    #
    # I am also using this for loop to trim unneeded rows in the INSTAGRAM
    # and YOUTUBE dataframes.
    #
    # By using the lambda function and dictionary generator, I was able to achieve a
    # 4 second increase in processing speed.
    for key in d.keys():
        
        # Because INSTAGRAM and YOUTUBE have different column structures than all others,
        # set up an if/else structure to process these separately
        if key == 'instagram':
           
            # Dictionary generator to create a dates dictionary with the unique date string
            # from the original dataframe and associating the to_datetime formatted date to 
            # said string
            dates = {date:pd.to_datetime(date) for date in d[key]['Date'].unique()}
            
            # use a lambda function to apply the new dates to the dataframe
            d[key]['Date'] = d[key]['Date'].apply(lambda v: dates[v])
            
            # Now use an apply function to apply to_datetime to all entries in the 'Time' column
            d[key]['Time'] = d[key]['Time'].apply(pd.to_datetime, errors='ignore')
            
            #d[key][['Date', 'Time']] = d[key][['Date', 'Time']].apply(pd.to_datetime, errors='ignore')
            
            # Trim the dataframe to only relevant rows
            d[key] = d[key].iloc[:, 2:24]
        elif key == 'youtube':
            dates = {date:pd.to_datetime(date) for date in d[key]['Date'].unique()}
            d[key]['Date'] = d[key]['Date'].apply(lambda v: dates[v])
            d[key]['Time'] = d[key]['Time'].apply(pd.to_datetime, errors='ignore')
            #d[key][['Date', 'Time']] = d[key][['Date', 'Time']].apply(pd.to_datetime, errors='ignore')
            d[key] = d[key].iloc[:, 2:17]
        else:
            dates = {date:pd.to_datetime(date) for date in d[key]['Date(ET)'].unique()}
            d[key]['Date(ET)'] = d[key]['Date(ET)'].apply(lambda v: dates[v])
            d[key][['Time(ET)', 'LocalTime']] = d[key][['Time(ET)', 'LocalTime']].apply(pd.to_datetime, errors='ignore')
            d[key] = d[key].iloc[:, 2:]
            
    clean_cols(d)
    
    
    for key in d.keys():
        d['{0}'.format(key)]['Cleaned_Text'] = d['{0}'.format(key)]['Cleaned_Text'].apply(cleanup)
    
    files_out(d)
    
    return(d)

In [3]:
def files_out(d):
    # Create an output directory for the new dataframes
    # if the directory already exists, just pass
    try:
        os.makedirs('./data/clean_dfs/')
    except FileExistsError:
        pass
    
    # Set up a for loop to iterate through all dataframes in the dictionary
    # If an output file does NOT already exist for that dataframe, 
    # create a new output file
    #
    # If the output file DOES exist, open that file in 'append' mode
    # and append the new data to the end without header information.
    #
    # Keeping the pandas index in place, so when data is read in for manipulation later,
    # set index_col = 0
    #
    # Also, index will have to be reset upon initial read in later as this append method
    # does not reset the index.  I am doing this in the interest of time savings.
    
    for key in d.keys():
        if not path.exists('./data/clean_dfs/{0}.csv'.format(key)):
            print('file {0}.csv does not exist. Creating new file'.format(key))
            d[key].to_csv('./data/clean_dfs/{0}.csv'.format(key), index=False)
        else:
            print('file {0}.csv does exist. Appending new data to existing file'.format(key))
            df_temp = pd.read_csv('./data/clean_dfs/{0}.csv'.format(key))
            df_temp = df_temp.append(d[key], ignore_index=True)
            df_temp.to_csv('./data/clean_dfs/{0}.csv'.format(key), index=False)
    

In [4]:
def clean_cols(df):
    for key in df.keys():
        df['{0}'.format(key)]['Cleaned_Text'] = df['{0}'.format(key)]['Contents']

    for key in df.keys():
        if key != 'instagram':
            if df['{0}'.format(key)]['Summary'] is not np.NaN:
                df['{0}'.format(key)]['Cleaned_Text'].fillna(df['{0}'.format(key)]['Summary'], inplace=True)
            if df['{0}'.format(key)]['Title'] is not np.NaN:
                df['{0}'.format(key)]['Cleaned_Text'].fillna(df['{0}'.format(key)]['Title'], inplace=True)
            if df['{0}'.format(key)]['Snippet'] is not np.NaN: 
                df['{0}'.format(key)]['Cleaned_Text'].fillna(df['{0}'.format(key)]['Snippet'], inplace=True)
        else:
            continue
    return(df)

In [5]:
def cleanup(text):
    
    #rusw_df = pd.read_csv('./Jim/rusw.csv')
    #stopwords = list(rusw_df.columns[0:1001])
    #stopwords_string = ''
    #for
        #stopwords_string = stopwords_string + i +' '
        
    ru_stopwords = stopwords.words('russian')
    rustopwords = ' '.join(ru_stopwords[:])
    
    
    
    #cstart = time.time()
    # insert a space between hashtags
    
    text = re.sub("\#"," ", text)
    
    # remove urls 
    
    text = re.sub("(http|https)\:\/\/.*\s?","",text)
    
    # remove HTML tags
    
    text = re.sub("\<.*\>","",text)
    
    # remove punctuation
    
    text = text.translate(str.maketrans('', '', punctuation))
    #cend = time.time()
    #print(text)
    #print("cleanup time:",cend - cstart)
   
    text = re.sub(rustopwords, '', text)
    
    
    
    tokens = [token for token in text.lower().split() if token not in rustopwords\
            and token != " "]
    
    text = " ".join(tokens)
    
    
    #return text.strip()
    return text

In [6]:
start = time.time()
df = read_data()
end = time.time()
print(end - start)


Columns (25,26,27,32) have mixed types. Specify dtype option on import or set low_memory=False.



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/inde

file news.csv does not exist. Creating new file
file forums.csv does not exist. Creating new file
file tumblr.csv does not exist. Creating new file
file twitter.csv does not exist. Creating new file
file instagram.csv does not exist. Creating new file
101.20800828933716


In [None]:
dateparse = lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

twitterdf = pd.read_csv('./data/clean_dfs/forums.csv', parse_dates=['Date(ET)', 'LocalTime', 'Time(ET)'], date_parser=dateparse)

#twitterdf = pd.read_csv('./data/clean_dfs/twitter.csv', parse_dates = True)

#df.groupby(pd.Grouper(key='Date(ET)', freq='M'))['Cleaned_Text'].sum()

#twitterdf.set_index('Date(ET)').groupby('Date(ET)')['Cleaned_Text'].resample("M").sum()
print(twitterdf.dtypes)
display(twitterdf.head())


dfthing = pd.DataFrame(twitterdf.groupby(pd.Grouper(key='Date(ET)', freq='W'))['Contents'].count())

dfthing.reset_index(drop=False, inplace=True)
dfthing

In [None]:
import seaborn as sns

In [None]:
count_list = list(dfthing.Contents)
date_list = list(dfthing['Date(ET)'])
#print(count_list, date_list)
plt.figure(figsize=(15,8))
sns.lineplot(x='Date(ET)', y='Contents', data=dfthing, color='red', lw=4)
plt.xlabel('Years', fontsize=22)
plt.ylabel('Post Count', fontsize=22)
plt.title('Trending', fontsize=22, fontweight='bold')
plt.xticks(rotation=45)

plt.show()

In [None]:
gapminder = plotly.data.dfthing()
fig = plotly.line(gapminder, x="Date(ET)", y="Contents", title='Life expectancy in Canada')
fig.show()