# Welome to Tech Day NLP Demonstration

In [1]:
import pandas as pd
import numpy as np
import time
import os
from os import path

pd.options.display.max_columns = None

In [23]:
def read_data2():
    df = pd.read_csv('./data/file.csv')
    
    # Create a temporary list of sources starting from the first instance of sources 
    # and returning unique values in the 1st column (the column containing source names)
    # This also pulls in extra instances of 'Source' in the dataframe
    temp_sources = list(df.iloc[((df[df[df.columns[1]] == 'Source'].index[0])+1):,1].unique())
    
    # Create source list and use a for loop to fill the source list
    # using the temporary source list but excluding 'Source'
    source_list = []
    for i in temp_sources:
        if i != 'Source':
            source_list.append(i.lower())
    
    # Sanity Check
    # print(source_list)
    
    # Determine the index values in the data frame where 'Source' 
    # is found in column 1. This is used later to slice the dataframe into
    # sub dataframes
    source_breaks = list(df[df[df.columns[1]] == 'Source'].index)
    
    # sanity check
    # print(source_breaks)
    
    # print(len(source_list), len(source_breaks))
    
    # create some empty holder lists and dictionaries; 'd' is going to be a dictionary of dataframes
    # The other temp lists and dictionary are used in creating these sub dataframes and associated column names
    d = {}
    bad_cols = []
    good_cols = []
    temp_dict = {}
    
    # Create a for loop to iterate through source_list to construct sub dataframes
    for i, vals in enumerate(source_list):
        
        # Create a temp var 'j' for indexing/slicing the dataframe with iloc
        j = i+1
        
        # Create an if/else to test if the value of 'j' exceeds the length of the source_list
        # If it does not, create some sub dataframes using the value of 'j' as the 'stop' point 
        # for iloc slice.
        #
        # If it does, use the length of the main dataframe as the 'stop' point for the iloc slice
        if j < len(source_list):
            d['{0}'.format(vals)] = df.iloc[source_breaks[i]+1: source_breaks[j]]
            
            # Rename the columns by creating a dictionary from the bad column names and
            # the actual column names found at a specific location in the main dataframe
            bad_cols = list(d['{0}'.format(vals)].columns)
            good_cols = list(df.iloc[source_breaks[i]])
            temp_dict = dict(zip(bad_cols, good_cols))
            
            d['{0}'.format(vals)].rename(columns=temp_dict, inplace=True)
            d['{0}'.format(vals)].reset_index(inplace=True, drop=True)
            
        else:
            d['{0}'.format(vals)] = df.iloc[source_breaks[i]+1:]
            
            # Rename the columns by creating a dictionary from the bad column names and
            # the actual column names found at a specific location in the main dataframe
            bad_cols = list(d['{0}'.format(vals)].columns)
            good_cols = list(df.iloc[source_breaks[i]])
            temp_dict = dict(zip(bad_cols, good_cols))
            d['{0}'.format(vals)].rename(columns=temp_dict, inplace=True)
            d['{0}'.format(vals)].reset_index(inplace=True, drop=True)
    
    # This for loop formats date-time columns to appropriate datatype.
    # This is by far the most time-instensive process of this function
    # Time savings could be realized if ONLY dates are needed and not time & date.
    #
    # I am also using this for loop to trim unneeded rows in the INSTAGRAM
    # and YOUTUBE dataframes.
    #
    # By using the lambda function and dictionary generator, I was able to achieve a
    # 4 second increase in processing speed.
    for key in d.keys():
        
        # Because INSTAGRAM and YOUTUBE have different column structures than all others,
        # set up an if/else structure to process these separately
        if key == 'instagram':
           
            # Dictionary generator to create a dates dictionary with the unique date string
            # from the original dataframe and associating the to_datetime formatted date to 
            # said string
            dates = {date:pd.to_datetime(date) for date in d[key]['Date'].unique()}
            
            # use a lambda function to apply the new dates to the dataframe
            d[key]['Date'] = d[key]['Date'].apply(lambda v: dates[v])
            
            # Now use an apply function to apply to_datetime to all entries in the 'Time' column
            d[key]['Time'] = d[key]['Time'].apply(pd.to_datetime, errors='ignore')
            
            #d[key][['Date', 'Time']] = d[key][['Date', 'Time']].apply(pd.to_datetime, errors='ignore')
            
            # Trim the dataframe to only relevant rows
            d[key] = d[key].iloc[:, 2:24]
        elif key == 'youtube':
            dates = {date:pd.to_datetime(date) for date in d[key]['Date'].unique()}
            d[key]['Date'] = d[key]['Date'].apply(lambda v: dates[v])
            d[key]['Time'] = d[key]['Time'].apply(pd.to_datetime, errors='ignore')
            #d[key][['Date', 'Time']] = d[key][['Date', 'Time']].apply(pd.to_datetime, errors='ignore')
            d[key] = d[key].iloc[:, 2:17]
        else:
            dates = {date:pd.to_datetime(date) for date in d[key]['Date(ET)'].unique()}
            d[key]['Date(ET)'] = d[key]['Date(ET)'].apply(lambda v: dates[v])
            d[key][['Time(ET)', 'LocalTime']] = d[key][['Time(ET)', 'LocalTime']].apply(pd.to_datetime, errors='ignore')
            d[key] = d[key].iloc[:, 2:]
    
    # Make a call to files_out function in order to output the newly generated dataframes
    files_out(d)
    
    return(d)

In [24]:
def files_out(d):
    # Create an output directory for the new dataframes
    # if the directory already exists, just pass
    try:
        os.makedirs('./data/clean_dfs/')
    except FileExistsError:
        pass
    
    # Set up a for loop to iterate through all dataframes in the dictionary
    # If an output file does NOT already exist for that dataframe, 
    # create a new output file
    #
    # If the output file DOES exist, open that file in 'append' mode
    # and append the new data to the end without header information.
    #
    # Keeping the pandas index in place, so when data is read in for manipulation later,
    # set index_col = 0
    #
    # Also, index will have to be reset upon initial read in later as this append method
    # does not reset the index.  I am doing this in the interest of time savings.
    
    for key in d.keys():
        if not path.exists('./data/clean_dfs/{0}.csv'.format(key)):
            print('file {0}.csv does not exist. Creating new file'.format(key))
            d[key].to_csv('./data/clean_dfs/{0}.csv'.format(key), index=False)
        else:
            print('file {0}.csv does exist. Appending new data to existing file'.format(key))
            df_temp = pd.read_csv('./data/clean_dfs/{0}.csv'.format(key))
            df_temp = df_temp.append(d[key], ignore_index=True)
            df_temp.to_csv('./data/clean_dfs/{0}.csv'.format(key), index=False)
    

In [25]:
start = time.time()
df = read_data2()
end = time.time()
print(end - start)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


file news.csv does not exist. Creating new file
file forums.csv does not exist. Creating new file
file tumblr.csv does not exist. Creating new file
file twitter.csv does not exist. Creating new file
file instagram.csv does not exist. Creating new file
17.9628963470459
