In [2]:
# import basic libraries
import ast
import glob
import os, json

import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None


In [3]:
# Root folder of the raw export; parse_all_json treats each sub-folder of it as one channel
main_folder_path = '../raw_data' 

def parse_all_json(main_folder_path):
    """Load every ``.json`` file found one level below ``main_folder_path``.

    Each immediate sub-folder is scanned for files ending in ``.json``. Every
    such file is read with ``pd.read_json`` and tagged with a ``channel_name``
    column holding the name of the sub-folder it came from.

    Parameters
    ----------
    main_folder_path : str
        Directory whose sub-folders contain the JSON files.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files. Per-file index labels are kept,
        so the resulting index is not unique. An empty DataFrame is returned
        when no JSON file is found.
    """
    frames = []

    # Sorted listings make the row order reproducible; os.listdir order is arbitrary
    for folder in sorted(os.listdir(main_folder_path)):
        folder_path = os.path.join(main_folder_path, folder)
        if not os.path.isdir(folder_path):
            # files lying next to the channel folders are ignored
            continue

        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.json'):
                continue
            channel_df = pd.read_json(os.path.join(folder_path, file_name))
            # remember which folder the rows came from
            channel_df['channel_name'] = folder
            frames.append(channel_df)

    # Concatenate once: growing a frame inside the loop re-copies all rows each time
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames)

# Load every channel export found under the configured raw-data folder
data = parse_all_json(main_folder_path)

In [4]:
# Write the combined raw frame to disk; the index is left out of the file
data.to_csv(path_or_buf=r'../csv/data_raw.csv', index=False)

In [10]:
# Work on independent deep copies so the loaded `data` frame stays as loaded
df = data.copy(deep=True)

df_clean = df.copy(deep=True)


In [6]:
def clean_dataframe(df_clean):
    """Remove unused columns and rows with an excluded ``subtype``.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame without the unused columns, without rows whose ``subtype``
        is one of the excluded values, and without the ``subtype`` column.
    """
    # columns not needed downstream
    unused_columns = ['type', 'client_msg_id', 'team', 'user_team',
                      'source_team', 'blocks', 'upload', 'display_as_bot',
                      'thread_ts', 'latest_reply', 'is_locked', 'subscribed',
                      'parent_user_id', 'bot_id', 'bot_profile', 'last_read', 'edited',
                      'purpose', 'inviter', 'topic', 'root', 'old_name', 'name', 'hidden',
                      'x_files']

    # NOTE(review): the original filter listed 'channel_join' twice; possibly
    # another subtype (e.g. 'channel_leave') was intended - confirm.
    excluded_subtypes = ['channel_join', 'channel_purpose', 'thread_broadcast']

    # errors='ignore': a missing optional column must not abort the cleaning;
    # drop() returns a new frame, so the caller's frame is left untouched
    trimmed = df_clean.drop(columns=unused_columns, errors='ignore')

    if 'subtype' in trimmed.columns:
        # rows with a missing subtype are kept: NaN never matches the list
        keep_row = ~trimmed['subtype'].isin(excluded_subtypes)
        # the subtype column is not needed once the rows are filtered
        trimmed = trimmed[keep_row].drop(columns='subtype')

    return trimmed

In [7]:
def datetime_wrangling(df_clean):
    """Derive calendar features from the ``ts`` column, then drop ``ts``.

    ``ts`` is interpreted as seconds since the Unix epoch. The following
    columns are added, in this order:

    * ``day_name``   - name of the weekday
    * ``day_number`` - day of the month
    * ``month``      - name of the month
    * ``day_type``   - 'Weekday' for Monday-Friday, otherwise 'Weekend'
    * ``time``       - hour of the day as a zero-padded string ('00'-'23')
    * ``dayparts``   - label of the four-hour block the hour falls into

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame with a ``ts`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the feature columns and without ``ts``.
    """
    daypart_labels = {1: 'Late Night',
                      2: 'Early Morning',
                      3: 'Morning',
                      4: 'Afternoon',
                      5: 'Evening',
                      6: 'Night'}

    # convert ts from epoch seconds to datetime
    ts = pd.to_datetime(df_clean['ts'], unit='s').astype('datetime64[s]')

    # hours 0-3 -> 1, 4-7 -> 2, 8-11 -> 3, 12-15 -> 4, 16-19 -> 5, 20-23 -> 6
    daypart_codes = (ts.dt.hour % 24 + 4) // 4

    # The labels are mapped onto a new Series and assigned as a column; calling
    # replace(..., inplace=True) on df['dayparts'] acts on a temporary object
    # and is not guaranteed to write back into the frame.
    return df_clean.drop(columns='ts').assign(
        day_name=ts.dt.day_name(),
        day_number=ts.dt.day,
        month=ts.dt.month_name(),
        day_type=ts.dt.weekday.map(lambda d: 'Weekday' if d < 5 else 'Weekend'),
        time=ts.dt.strftime('%H'),
        dayparts=daypart_codes.map(daypart_labels),
    )

In [11]:
def return_attachments(txt):
    """Extract the ``original_url`` of the first attachment.

    Applied element-wise to the ``attachments`` column.

    Parameters
    ----------
    txt : object
        Cell value; expected to be a sequence whose first element is a dict.
        Anything else (NaN, None, empty list, ...) is treated as "no link".

    Returns
    -------
    The value stored under ``'original_url'`` in the first element, or the
    placeholder string ``'None'`` whenever no such link can be read.
    """
    try:
        first = txt[0]
    except (TypeError, IndexError, KeyError):
        # not indexable (e.g. NaN) or empty: there is no first attachment
        return 'None'

    if isinstance(first, dict) and 'original_url' in first:
        return first['original_url']

    # Same placeholder as above, so "no link" has a single representation
    return 'None'
    
# Replace every attachments cell by the value return_attachments extracts from it
df_clean['attachments'] = df_clean['attachments'].map(return_attachments)

In [12]:
def real_name(x):
    """Extract ``real_name`` from one ``user_profile`` cell.

    Parameters
    ----------
    x : object
        Cell value; a dict when a profile is present, NaN/None otherwise.

    Returns
    -------
    The value stored under ``'real_name'``, or ``'noname'`` when the cell is
    missing, is not a dict, or the dict has no ``'real_name'`` key.
    """
    if isinstance(x, dict):
        return x.get('real_name', 'noname')
    # NaN, None and any other non-dict value carry no profile information
    return 'noname'

    
# Derive the real_name column from the profile cells ...
df_clean['real_name'] = df_clean['user_profile'].map(real_name)

# ... after which the user_profile column is no longer needed
del df_clean['user_profile']

In [13]:
def reactions_count(txt):
    """Return the count stored in the first entry of a ``reactions`` cell.

    Applied element-wise to the ``reactions`` column.

    Parameters
    ----------
    txt : object
        Either a list of dicts (as loaded from JSON) or the string form of
        such a list (as read back from a CSV). NaN/None/empty values are
        treated as "no reactions".

    Returns
    -------
    The value stored under ``'count'`` in the first entry (falling back to
    the legacy ``'reactions'`` key), or the placeholder string ``'None'``.
    """
    try:
        # literal_eval only accepts Python literals, unlike eval(), which would
        # execute arbitrary code and fails on values that are already lists
        entries = ast.literal_eval(txt) if isinstance(txt, str) else txt
        first = entries[0]
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        return 'None'

    if isinstance(first, dict):
        # NOTE(review): reaction entries are assumed to carry their number
        # under 'count' - confirm against the export. 'reactions' is the key
        # the previous implementation looked up and is kept as a fallback.
        for key in ('count', 'reactions'):
            if key in first:
                return first[key]

    return 'None'
    
# One reactions_count value per row, computed from that row's reactions cell
df_clean['reactions_count'] = df_clean['reactions'].map(reactions_count)

In [14]:
def reactions_name(txt):
    """Return the ``name`` stored in the first entry of a ``reactions`` cell.

    Applied element-wise to the ``reactions`` column.

    Parameters
    ----------
    txt : object
        Either a list of dicts (as loaded from JSON) or the string form of
        such a list (as read back from a CSV). NaN/None/empty values are
        treated as "no reactions".

    Returns
    -------
    The value stored under ``'name'`` in the first entry, or the placeholder
    string ``'None'`` when it cannot be read.
    """
    try:
        # literal_eval only accepts Python literals, unlike eval(), which would
        # execute arbitrary code and fails on values that are already lists
        entries = ast.literal_eval(txt) if isinstance(txt, str) else txt
        first = entries[0]
    except (ValueError, SyntaxError, TypeError, IndexError, KeyError):
        return 'None'

    if isinstance(first, dict) and 'name' in first:
        return first['name']

    # Same placeholder as above, so "no name" has a single representation
    return 'None'

# One reactions_name value per row, computed from that row's reactions cell
df_clean['reactions_name'] = df_clean['reactions'].map(reactions_name)

In [15]:
def boolean_features(df_clean):
    """Add boolean flag columns for reactions, replies and attachments.

    Added columns, in this order:

    * ``reaction_true``    - True when ``reactions_name`` holds a value
    * ``replies_true``     - True when ``reply_count`` is greater than zero
    * ``attachments_true`` - True when ``attachments`` holds a value

    A cell "holds a value" when it is neither missing nor the placeholder
    string ``'None'`` that the extraction helpers return for "nothing found".

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame with ``reactions_name``, ``reply_count`` and ``attachments``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the three flag columns appended.
    """
    def has_value(column):
        # isna() alone would flag the *absence* of a value and would also
        # miss the 'None' placeholder string
        return column.notna() & column.ne('None')

    return df_clean.assign(
        reaction_true=has_value(df_clean['reactions_name']),
        # a missing reply_count is treated as zero replies
        replies_true=df_clean['reply_count'].fillna(0).gt(0),
        attachments_true=has_value(df_clean['attachments']),
    )

In [16]:
# Display names that are classified as teaching staff
TEACHER_NAMES = ('siand the LT (she/her)', 'Florian Titze', 'Kosta')


def type_of_participant(s, teachers=TEACHER_NAMES):
    """Classify a participant as 'teacher' or 'student' by display name.

    Parameters
    ----------
    s : object
        Value of the ``real_name`` column for one row.
    teachers : collection of str, optional
        Names classified as teachers; defaults to ``TEACHER_NAMES``.

    Returns
    -------
    str
        ``'teacher'`` when ``s`` equals one of ``teachers``, else ``'student'``.
    """
    return 'teacher' if s in teachers else 'student'

# Label every row by passing its real_name through type_of_participant
df_clean['participant'] = df_clean['real_name'].map(type_of_participant)

In [17]:
def text_length(df_clean):
    """Add a ``text_length`` column with the character count of ``text``.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame with a ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the integer ``text_length`` column appended. Rows with
        a missing ``text`` get length 0.
    """
    text = df_clean['text']
    # astype(str) turns NaN/None into 'nan'/'None' (3 and 4 characters), so
    # the length of rows without text is reset to 0 afterwards
    lengths = text.astype(str).map(len).where(text.notna(), 0)
    return df_clean.assign(text_length=lengths)

In [18]:
# Run the wrangling steps in sequence; each step receives the previous step's frame
df_clean = (
    df_clean
    .pipe(clean_dataframe)
    .pipe(datetime_wrangling)
    .pipe(boolean_features)
    .pipe(text_length)
)

In [19]:
def clean_post_feature_eng(df_clean):
    """Finalize the frame after feature engineering.

    Missing reply counters are replaced by 0 and cast to integers, and the
    frame is reduced to the final set of columns in their final order. Every
    other column (e.g. ``reactions``, ``reply_users``, ``replies``) is left
    out by that selection.

    Parameters
    ----------
    df_clean : pandas.DataFrame
        Frame containing at least the final columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the final columns.

    Raises
    ------
    KeyError
        If one of the final columns is missing.
    """
    final_columns = ['channel_name', 'user', 'real_name', 'participant',
                     'text', 'text_length', 'reply_count', 'reply_users_count',
                     'replies_true', 'day_name', 'day_type', 'time',
                     'dayparts', 'day_number', 'month', 'reactions_count',
                     'reactions_name', 'attachments', 'attachments_true', 'reaction_true']

    # Selecting the final columns already discards the unnecessary ones, so no
    # separate drop is needed (it made the function fail when run a second
    # time). copy() keeps the caller's frame untouched.
    tidy = df_clean[final_columns].copy()

    # replace missing counters with zero and store them as integers
    for counter in ('reply_count', 'reply_users_count'):
        tidy[counter] = tidy[counter].fillna(0).astype(int)

    return tidy

In [20]:
# Final tidy-up: reply counters filled and columns put in their final order
df_clean = df_clean.pipe(clean_post_feature_eng)


In [21]:
# Write the cleaned frame to disk; the index is left out of the file
df_clean.to_csv(path_or_buf=r'../csv/data_clean_optimized.csv', index=False)
