In [3]:
import os
import pandas as pd
# Show up to 500 columns when displaying wide frames. The fully qualified key is
# required: the bare pattern 'max_columns' also matches 'styler.render.max_columns'
# on pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 500)
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from functools import reduce
en_stopwords = set(stopwords.words('english')) 
import itertools
import time
import re
import numpy as np
from collections import Counter

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# S3 prefix; presumably the bucket for shared outputs (not referenced in the cells shown here) — TODO confirm.
my_folder = "s3://trident-retention-output/"
# S3 prefix that is passed to the loaders below as the location of the AskUnum CSV files.
folder = "s3://trident-retention-data/askunum/"
# Local directory (under the current working directory) where feature CSVs are written.
output_dir = os.path.join(os.getcwd(), "outputs")

In [7]:
# x = pd.read_csv(folder + 'askunum_2022_1.csv', nrows=5)
# for i in x.loc[x.ParentId=='5003x00002GuQ6tAAF'].sort_values('MessageDate', ascending=False)['TextBody']: 
#     print('-'*200)
#     print(i)

## ID counts and issue durations

#### Load the data and process it into issue counts and durations
- The data format differs slightly from year to year, so we first standardize the column names and then apply a single shared function to compute the counts and durations.

In [8]:
def load_askunum_df(folder, year, usecols=None, nrows=None):
    """Load the raw AskUnum CSV export(s) for one year into a single DataFrame.

    Every supported year uses its own file-name pattern, and 2019-2022 are
    split across several part files, so the file names are looked up per year
    and the parts are concatenated in order.

    Args:
        folder (str): Prefix the file names are appended to. It is concatenated
            as-is, so it must already end with a path separator.
        year (int): One of 2018, 2019, 2020, 2021 or 2022.
        usecols (list, optional): Forwarded unchanged to ``pd.read_csv``.
        nrows (int, optional): Forwarded unchanged to ``pd.read_csv``; the limit
            applies to each part file separately.

    Returns:
        pd.DataFrame: The rows of all files for ``year``. When several part
        files are concatenated their original row indices are kept (not reset).

    Raises:
        ValueError: If ``year`` is not one of the supported years.
    """
    file_names_by_year = {
        2018: ['askunum_2018.csv'],
        2019: ['askunum_2019_{}.csv'.format(i) for i in range(1, 4)],
        2020: ['unnested_2020_{}_customer.csv'.format(i) for i in range(10)],
        2021: ['unnested_2021_{}_customer.csv'.format(i) for i in range(10)],
        2022: ['askunum_2022_{}.csv'.format(i) for i in range(0, 4)],
    }
    if year not in file_names_by_year:
        # Previously this fell through to `return askunum_df` with the name unbound.
        raise ValueError(
            'Unsupported year {!r}; expected one of {}'.format(year, sorted(file_names_by_year))
        )

    frames = [
        pd.read_csv(folder + file_name, encoding='latin-1', usecols=usecols, nrows=nrows)
        for file_name in file_names_by_year[year]
    ]
    # A single-file year is returned as read; multi-part years are stacked.
    if len(frames) == 1:
        return frames[0]
    return pd.concat(frames)

def pipeline_askunum_counts_and_duration(folder, year, usecols=False, parent_id=False, target_columns=False):
    """Build monthly AskUnum message/issue counts and issue durations per account.

    Steps: load the year's raw export, attach the account id from
    ``{year}ParentAccount.csv``, rename the year-specific columns to a standard
    schema, aggregate per (account_id, year, month) and write the result to
    ``askunum_issue_count_and_duration_{year}.csv`` inside the module-level
    ``output_dir`` (created if missing). Shapes and the column mapping are
    printed along the way.

    Args:
        folder (str): Prefix of the input files; must end with a path separator.
        year (int): Year to process. Defaults for ``usecols`` / ``parent_id``
            exist for 2018-2022.
        usecols (list or False): Raw column names to read, ordered as
            id, parent id, created date, closed date. Falsy selects the
            per-year default.
        parent_id (str or False): Name of the raw parent-id column. Falsy
            selects the per-year default.
        target_columns (list or False): Standard names assigned positionally to
            ``usecols`` plus the account-id column. Falsy selects
            ['Id', 'ParentId', 'CreatedDate', 'ClosedDate', 'account_id'].

    Returns:
        tuple: ``(askunum_df, askunum_features)`` where ``askunum_df`` is the
        merged, renamed message-level frame (with ``CreatedDate`` parsed and
        ``year`` / ``month`` columns added) and ``askunum_features`` is the
        aggregated frame indexed by (account_id, year, month).

    Raises:
        ValueError: If a default is needed for a year that has none.
    """
    # Raw column names (id, parent id, created, closed) and parent-id column per year.
    upper_schema = (['ID', 'PARENTID', 'PARENT.CREATEDDATE', 'PARENT.CLOSEDDATE'], 'PARENTID')
    plain_schema = (['Id', 'ParentId', 'CreatedDate', 'ClosedDate'], 'ParentId')
    dotted_schema = (['Id', 'ParentId', 'Parent.CreatedDate', 'Parent.ClosedDate'], 'ParentId')
    default_schema_by_year = {
        2018: upper_schema,
        2019: upper_schema,
        2020: plain_schema,
        2021: plain_schema,
        2022: dotted_schema,
    }

    if not target_columns:
        target_columns = ['Id', 'ParentId', 'CreatedDate', 'ClosedDate', 'account_id']

    if not usecols or not parent_id:
        if year not in default_schema_by_year:
            raise ValueError(
                'No default columns for year {!r}; pass usecols and parent_id explicitly '
                'or use one of {}'.format(year, sorted(default_schema_by_year))
            )
        default_usecols, default_parent_id = default_schema_by_year[year]
        usecols = usecols or list(default_usecols)
        parent_id = parent_id or default_parent_id

    def helper_get_counts_and_duration(askunum_df):
        """Aggregate message-level rows into monthly counts and durations.

        Note: ``askunum_df`` is modified in place -- ``CreatedDate`` is parsed
        to datetime and ``year`` / ``month`` columns are added.

        Args:
            askunum_df (pd.DataFrame): dataframe with
                ['Id', 'ParentId', 'account_id', 'CreatedDate', 'ClosedDate'] as columns

        Returns:
            pd.DataFrame indexed by ['account_id', 'year', 'month'] with columns
            ['askunum_id_count', 'askunum_parentid_count', 'askunum_days'];
            missing values are filled with 0.
        """
        group_keys = ['account_id', 'year', 'month']
        feature_cols = ['askunum_id_count', 'askunum_parentid_count', 'askunum_days']

        # counts are bucketed by the month of the created date
        askunum_df['CreatedDate'] = pd.to_datetime(askunum_df['CreatedDate'])
        askunum_df['year'] = askunum_df['CreatedDate'].dt.year
        askunum_df['month'] = askunum_df['CreatedDate'].dt.month

        email_counts_by_month = askunum_df.groupby(group_keys)[['Id']].count()
        # dropping Id and de-duplicating leaves one row per distinct issue record
        issue_counts_by_month = (
            askunum_df.drop(columns='Id')
            .drop_duplicates()
            .groupby(group_keys)[['ParentId']]
            .count()
        )
        combined_df = email_counts_by_month.join(issue_counts_by_month).rename(
            columns={'Id': 'askunum_id_count', 'ParentId': 'askunum_parentid_count'}
        )

        # durations only exist for rows that have a closed date; copy so the
        # new columns are not written onto a slice of the caller's frame
        closed_df = askunum_df.loc[askunum_df['ClosedDate'].notna()].copy()
        closed_df['ClosedDate'] = pd.to_datetime(closed_df['ClosedDate'])
        closed_df['askunum_days'] = (
            (closed_df['ClosedDate'] - closed_df['CreatedDate']).dt.total_seconds() / 86400
        )
        # NOTE(review): the sum runs over every message row, so an issue with k
        # messages contributes k times its duration -- confirm this weighting is intended.
        issue_days_by_month = closed_df.groupby(group_keys)[['askunum_days']].sum()
        combined_df = combined_df.join(issue_days_by_month, how='outer')

        # Assign back: fillna(inplace=True) on a column subset only fills a temporary copy.
        combined_df[feature_cols] = combined_df[feature_cols].fillna(0)
        return combined_df

    # use ParentId as the standard parent-id column name
    askunum_df = load_askunum_df(folder, year, usecols=usecols).rename(columns={parent_id: 'ParentId'})
    account_mapping = (
        pd.read_csv(folder + '{}ParentAccount.csv'.format(year), usecols=['ParentId', 'Parent.AccountId'])
        .drop_duplicates()
        .dropna()
    )
    askunum_df = pd.merge(askunum_df, account_mapping, on='ParentId')

    # map raw names to the standard names by position
    cols = ['ParentId' if col == parent_id else col for col in usecols] + ['Parent.AccountId']
    print(target_columns, cols)
    askunum_df = askunum_df.rename(columns=dict(zip(cols, target_columns)))
    print(askunum_df.shape)

    askunum_features = helper_get_counts_and_duration(askunum_df)
    # to_csv does not create missing directories
    os.makedirs(output_dir, exist_ok=True)
    askunum_features.to_csv(os.path.join(output_dir, 'askunum_issue_count_and_duration_{}.csv'.format(year)))
    print(askunum_features.shape)
    return askunum_df, askunum_features

In [None]:
# The earlier years are switched off by the constant guard; only 2022 is processed.
if False:
    for past_year in (2018, 2019, 2020, 2021):
        pipeline_askunum_counts_and_duration(folder, past_year)
# x holds the (askunum_df, askunum_features) tuple returned for 2022.
x = pipeline_askunum_counts_and_duration(folder, 2022)

In [None]:
# Preview the first rows of the message-level frame (element 0 of the tuple in x).
x[0].head(n=5)