## **In this notebook**, 

1.   Loading csv files
3.   Checks data and cleans it
4.   Generating stats for csv file

### Importing required libraries

In [170]:
import pandas as pd
import numpy as np
from datetime import datetime
from nltk.tokenize import WhitespaceTokenizer

## **Loading and processing master log csv file for one subreddit**

In [None]:
com_science_df = pd.read_csv('/content/com_stream_science.csv')
com_france_df = pd.read_csv('/content/com_stream_france.csv')
com_rance_df = pd.read_csv('/content/com_stream_rance.csv')
com_geopolitics_df = pd.read_csv('/content/com_stream_geopolitics.csv')
com_europe_df = pd.read_csv('/content/com_stream_europe.csv')
com_askeurope_df = pd.read_csv('/content/com_stream_askeurope.csv')

In [145]:
sub_science_df = pd.read_csv('/content/sub_stream_science.csv')
sub_france_df = pd.read_csv('/content/sub_stream_france.csv')
sub_rance_df = pd.read_csv('/content/sub_stream_rance.csv')
sub_geopolitics_df = pd.read_csv('/content/sub_stream_geopolitics.csv')
sub_europe_df = pd.read_csv('/content/sub_stream_europe.csv')
sub_askeurope_df = pd.read_csv('/content/sub_stream_askeurope.csv')

# **Checking data and cleaning the master log data**

#### For checking and debugging dataframe data 

In [None]:
com_science_df[com_science_df.score.isna()] #for com
# if output is empty, we are good
# this is the case when the csv file is not appended properly, columns are mismatched.

In [None]:
sub_science_df[sub_science_df.upvote_ratio.isna()] # for posts
# if output is empty, we are good
# this is the case when the csv file is not appended properly, columns are mismatched.

In [159]:
# To update the dataframe after removing rows which are of no use.
# Use the above cell to check the rows in above cell, to see if they should be removed or not.
# score andupvote_ratio would be empty if the csv files are not being appended properly

def cleaning(df, df_type):
  # print('before: ', len(df))
  if df_type == 'com':
    df = df[df.body.isna()==False]
    df = df[df.score.isna()==False]
  else:
    df = df[df.upvote_ratio.isna()==False]
  
  df = df[df.author.isna()==False]
  df = df[df.id.isna()==False]
  df = df[df.created_utc.isna()==False]
  df.created_utc = df.created_utc.apply(lambda x: float(x))

  # print('after: ', len(df))
  return df

#### Cleaning **comment** dataframes

In [None]:
com_science_df = cleaning(com_science_df, 'com')
com_france_df = cleaning(com_france_df, 'com')
com_rance_df = cleaning(com_rance_df, 'com')
com_geopolitics_df = cleaning(com_geopolitics_df, 'com')
com_askeurope_df = cleaning(com_askeurope_df, 'com')
com_europe_df = cleaning(com_europe_df, 'com')

#### Cleaning **submissions/posts** dataframes

In [None]:
sub_science_df = cleaning(sub_science_df, 'sub')
sub_france_df = cleaning(sub_france_df, 'sub')
sub_rance_df = cleaning(sub_rance_df, 'sub')
sub_geopolitics_df = cleaning(sub_geopolitics_df, 'sub')
sub_askeurope_df = cleaning(sub_askeurope_df, 'sub')
sub_europe_df = cleaning(sub_europe_df, 'sub')

#**Word Count and stats**

In [168]:
def word_count(df, df_type):
  tk = WhitespaceTokenizer()
  if df_type == 'com':
    df['word_count'] = df.body.apply(lambda x: len(tk.tokenize(str(x))))
  else:
    # Concatenating selftext and title for post
    df['content'] = np.where(df['selftext'].isnull(), df['title'], df['title']+df['selftext'])
    df['word_count'] = df.content.apply(lambda x: len(tk.tokenize(x)))
    
  return str(df.word_count.sum())

#### Stats for **comments** dataframes

In [178]:
print('Data Timestamp: '+str(datetime.fromtimestamp(com_france_df.created_utc.min()).date().strftime('%b-%d-%Y'))+' to '+str(datetime.fromtimestamp(com_france_df.created_utc.max()).date().strftime('%b-%d-%Y')))
print("\nScience ------- Number of comments: "+str(len(com_science_df))+" ------- Word count: "+word_count(com_science_df, 'com'))
print("\nFrance ------- Number of comments: "+str(len(com_france_df))+" ------- Word count: "+word_count(com_france_df, 'com'))
print("\nRance ------- Number of comments: "+str(len(com_rance_df))+" ------- Word count: "+word_count(com_rance_df, 'com'))
print("\nEurope ------- Number of comments: "+str(len(com_europe_df))+" ------- Word count: "+word_count(com_europe_df, 'com'))
print("\nAskEurope ------- Number of comments: "+str(len(com_askeurope_df))+" ------- Word count: "+word_count(com_askeurope_df, 'com'))
print("\nGeopolitics ------- Number of comments: "+str(len(com_geopolitics_df))+" ------- Word count: "+word_count(com_geopolitics_df, 'com'))


Data Timestamp: May-09-2022 to Aug-27-2022

Science ------- Number of comments: 523307 ------- Word count: 19979361

France ------- Number of comments: 706583 ------- Word count: 30022514

Rance ------- Number of comments: 112319 ------- Word count: 1999828

Europe ------- Number of comments: 869751 ------- Word count: 31977386

AskEurope ------- Number of comments: 72994 ------- Word count: 3512914

Geopolitics ------- Number of comments: 20787 ------- Word count: 1492885


#### Stats for **submission/posts** dataframes

In [179]:
print('Data Timestamp: '+str(datetime.fromtimestamp(sub_france_df.created_utc.min()).date().strftime('%b-%d-%Y'))+' to '+str(datetime.fromtimestamp(sub_france_df.created_utc.max()).date().strftime('%b-%d-%Y')))
print("\nScience ------- Number of posts: "+str(len(sub_science_df))+" ------- Word count: "+word_count(sub_science_df, 'sub'))
print("\nFrance ------- Number of posts: "+str(len(sub_france_df))+" ------- Word count: "+word_count(sub_france_df, 'sub'))
print("\nRance ------- Number of posts: "+str(len(sub_rance_df))+" ------- Word count: "+word_count(sub_rance_df, 'sub'))
print("\nEurope ------- Number of posts: "+str(len(sub_europe_df))+" ------- Word count: "+word_count(sub_europe_df, 'sub'))
print("\nAskEurope ------- Number of posts: "+str(len(sub_askeurope_df))+" ------- Word count: "+word_count(sub_askeurope_df, 'sub'))
print("\nGeopolitics ------- Number of posts: "+str(len(sub_geopolitics_df))+"------- Word count: "+word_count(sub_geopolitics_df, 'sub'))


Data Timestamp: May-09-2022 to Aug-27-2022

Science ------- Number of posts: 5890 ------- Word count: 128157

France ------- Number of posts: 15862 ------- Word count: 1053922

Rance ------- Number of posts: 5196 ------- Word count: 59373

Europe ------- Number of posts: 11676 ------- Word count: 158378

AskEurope ------- Number of posts: 984 ------- Word count: 91475
