# Validate Processed DF
**Author:** Jim Maddock

**Last Updated:** 5-7-20

**Description:** Validate the dataset processed with `clean_dataset_4-21-20.ipynb` against the raw dataset.

In [6]:
import pandas as pd
import numpy as np
import uuid
import logging
import os
import random

import matplotlib.pyplot as plt
import matplotlib

matplotlib.style.use('ggplot')

In [43]:
# import dataset from csv
BASE_FILE_PATH = '/srv/aft/processed/chunks/'

dtypes = {
    'aft_id':object
}

file_list = os.listdir(BASE_FILE_PATH)
processed_df = pd.DataFrame()
for file in file_list:
    file_path = os.path.join(BASE_FILE_PATH,file)
    chunk = pd.read_csv(file_path,escapechar='\\', encoding='latin-1', dtype=dtypes)
    processed_df = processed_df.append(chunk)


In [44]:
RAW_FILE_PATH = '/srv/aft/raw/dump_03-24-20.csv'
raw_df = pd.read_csv(RAW_FILE_PATH,escapechar='\\', encoding='latin-1', dtype=dtypes)

# remove all comments that do not have a helpful or unhelpful label
raw_df = raw_df.loc[(raw_df['aft_helpful'] > 0) | (raw_df['aft_unhelpful'] > 0)]

# generate a unique ID
# make sure to use a random seed so this is reproducable
rd = random.Random()
rd.seed(0)
raw_df['UUID'] = raw_df.apply(lambda x: str(uuid.UUID(int=rd.getrandbits(128))), axis=1)

In [45]:
print('Number of raw rows: {0}'.format(len(raw_df)))
print('Number of processed rows: {0}'.format(len(processed_df)))

Number of raw rows: 114984
Number of processed rows: 114983


In [47]:
print('UUIDs not in processed_df: {0}'.format(len(raw_df.loc[~raw_df['UUID'].isin(processed_df['UUID'])])))
print('UUIDs not in raw_df: {0}'.format(len(processed_df.loc[~processed_df['UUID'].isin(raw_df['UUID'])])))
print('overlap: {0}'.format(len(processed_df.loc[processed_df['UUID'].isin(raw_df['UUID'])])))

UUIDs not in processed_df: 36
UUIDs not in raw_df: 35
overlap: 114948


* UUIDs not in processed DF seem to be caused by uncaught escape chars in comments, specifically at the end of strings

In [50]:
print('number of unique raw UUIDs: {0}'.format(raw_df['UUID'].nunique()))
print('number of unique processed UUIDs: {0}'.format(processed_df['UUID'].nunique()))

number of unique raw UUIDs: 114984
number of unique processed UUIDs: 114948


In [36]:
raw_df.loc[raw_df['UUID'] == 'e3e70682-c209-4cac-629f-6fbed82c07cd']

Unnamed: 0,aft_id,aft_page,aft_page_revision,aft_user,aft_user_text,aft_user_token,aft_form,aft_cta,aft_link,aft_rating,...,aft_archive,aft_archive_date,aft_helpful,aft_unhelpful,aft_has_comment,aft_net_helpful,aft_relevance_score,aft_discuss,aft_claimed_user,UUID
1,04f8e0fdffaf1e9b25a890b11c27a364,3235587,543382932,0,216.38.130.162,x6riNeCDobHCPn2XXciJq7x4xvA0KmpA,6,4,X,1,...,0,N,3,0,1,3,53,N,0,e3e70682-c209-4cac-629f-6fbed82c07cd


In [59]:
processed_df.groupby('UUID').size().to_frame('count').sort_values('count',ascending=False)

AttributeError: 'DataFrame' object has no attribute 'to_frame'

In [65]:
processed_df.loc[processed_df['UUID'].isnull()]['']

93    051184453363dae40bd7842b2b77d26b
56    11899500000000000000000000000000
19    16052100000000000000000000000000
28    16178800000000000000000000000000
60    16197000000000000000000000000000
52    16821600000000000000000000000000
19    18615000000000000000000000000000
79    21037400000000000000000000000000
79    21444600000000000000000000000000
59    22491600000000000000000000000000
76    22583200000000000000000000000000
46    26681400000000000000000000000000
5     28754500000000000000000000000000
98    29373700000000000000000000000000
19    33539100000000000000000000000000
9     33580500000000000000000000000000
95    35628500000000000000000000000000
15    37543100000000000000000000000000
99    38273900000000000000000000000000
12    39461400000000000000000000000000
91    41527900000000000000000000000000
20    42029700000000000000000000000000
39    43140000000000000000000000000000
86    43338200000000000000000000000000
59    54040800000000000000000000000000
42    5476110000000000000

In [83]:
raw_df.loc[raw_df['aft_id'] == '84455100000000000000000000000000']['aft_comment'].tolist()[0]

'communication sector in short news\n\\'

In [82]:
processed_df.loc[processed_df['aft_id'] == '84455100000000000000000000000000']['aft_comment']

24    communication sector in short news\n",20130123...
Name: aft_comment, dtype: object