In [1]:
import pandas as pd
import pprint, os, chardet

In [2]:
# Define input files directory and output file name
input_dir_name = 'raw_datasets'
output_file = 'SA.csv'

This script location is in the /data directory of the project. It assumes the following data directory structure:

```
Skynet
| - data
|    | - [input_dir_name]
|    | - clean-csv.ipynb
|    | - [output_file]
```

In [3]:
# Use all files in script directory as input
inputs = []
input_dir_name = os.path.join(os.path.abspath(""), input_dir_name)
for filename in os.listdir(input_dir_name):
    filepath = os.path.join(os.path.abspath(input_dir_name), filename)
    if os.path.isfile(filepath):
        inputs.append(filepath)

# Validate input
pprint.pprint(inputs)

['d:\\526_term_project\\Skynet\\data\\raw_datasets\\IMDB.csv',
 'd:\\526_term_project\\Skynet\\data\\raw_datasets\\sa_test.csv',
 'd:\\526_term_project\\Skynet\\data\\raw_datasets\\sa_train.csv',
 'd:\\526_term_project\\Skynet\\data\\raw_datasets\\twitter_training.csv',
 'd:\\526_term_project\\Skynet\\data\\raw_datasets\\twitter_validation.csv',
 'd:\\526_term_project\\Skynet\\data\\raw_datasets\\yelp_test.csv',
 'd:\\526_term_project\\Skynet\\data\\raw_datasets\\yelp_train.csv']


In [None]:
# WARNING: Add column names to the beginning of the csv files before running this snippet
# Each file must have the 2 required columns review/text and label
dfs = pd.DataFrame()
for filepath in inputs:
    # Detect Encoding using chardet Library
    with open(filepath, 'rb') as f:
        data = f.read()
    encoding = chardet.detect(data)['encoding']
    print(filepath, "encoding:", encoding)

    df = pd.read_csv(filepath, encoding=encoding) # Read csv
    print(filepath, "columns:", df.columns.tolist())

    # Rename relevant column names to 'text' and 'label'
    if 'review' in df:
        df.rename(columns={'review': 'text'}, inplace=True)
    elif 'tweet content' in df:
        df.rename(columns={'tweet content': 'text'}, inplace=True)
    df.rename(columns={'sentiment': 'label'}, inplace=True)

    print(filepath, "columns after cleaning:", df.columns.tolist())
    df = df[['text', 'label']] # Keep only relevant columns

    # Remove all NA values
    df.dropna(inplace=True) # Drop NAs

    # Preview unique values in dataframe
    print(filepath, "label unique values:", df['label'].unique())

    # Add df into big dataframe
    dfs = pd.concat([dfs, df])

d:\526_term_project\Skynet\data\raw_datasets\IMDB.csv encoding: utf-8
d:\526_term_project\Skynet\data\raw_datasets\IMDB.csv columns: ['review', 'sentiment']
d:\526_term_project\Skynet\data\raw_datasets\IMDB.csv columns after cleaning: ['text', 'sentiment']
d:\526_term_project\Skynet\data\raw_datasets\IMDB.csv sentiment unique values: ['positive' 'negative']
d:\526_term_project\Skynet\data\raw_datasets\sa_test.csv encoding: Windows-1252
d:\526_term_project\Skynet\data\raw_datasets\sa_test.csv columns: ['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
d:\526_term_project\Skynet\data\raw_datasets\sa_test.csv columns after cleaning: ['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
d:\526_term_project\Skynet\data\raw_datasets\sa_test.csv sentiment unique values: ['neutral' 'positive' 'negative']
d:\526_term_project\Skynet\data

In [5]:
# Review return dataset
dfs

Unnamed: 0,text,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
559995,Ryan was as good as everyone on yelp has claim...,2
559996,Professional \nFriendly\nOn time AND affordabl...,2
559997,Phone calls always go to voicemail and message...,1
559998,Looks like all of the good reviews have gone t...,1


In [None]:
# Re-encode sentiment to the integers. Also drops irrelevant data
# -1 - negative
#  0 - neutral
#  1 - positive

# Update this map using the unique values of each dataset from the result obtained above
sent_map = {
    'positive': 1,
    'Positive': 1,
    'neutral': 0,
    'Neutral': 0,
    'negative': -1,
    'Negative': -1,
    1: -1,
    2: 1
}

def reencode_sentiment(sentiment):
    return sent_map[sentiment]

ret_df = dfs.copy()
ret_df['label'] = ret_df['label'].map(sent_map)

print('Unique values of label column:', ret_df['label'].unique())
print('Count of values in label column:')
print(ret_df['label'].value_counts)

ret_df.dropna(inplace=True)
ret_df['label'] = ret_df['label'].astype(int) # Coerced float values into int

Unique values of sentiment column: [ 1. -1.  0. nan]
Count of values in sentiment column:
<bound method IndexOpsMixin.value_counts of 0         1.0
1         1.0
2         1.0
3        -1.0
4         1.0
         ... 
559995    1.0
559996    1.0
559997   -1.0
559998   -1.0
559999    1.0
Name: sentiment, Length: 754010, dtype: float64>


In [None]:
# Review return dataset
ret_df

Unnamed: 0,text,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,-1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
559995,Ryan was as good as everyone on yelp has claim...,1
559996,Professional \nFriendly\nOn time AND affordabl...,1
559997,Phone calls always go to voicemail and message...,-1
559998,Looks like all of the good reviews have gone t...,-1


In [None]:
print('Unique values of label column after removing irrelevant values:', ret_df['label'].unique())
print('Count of values in label column after removing irrelevant values:')
print(ret_df['label'].value_counts)

Unique values of sentiment column after removing irrelevant values: [ 1 -1  0]
Count of values in sentiment column after removing irrelevant values:
<bound method IndexOpsMixin.value_counts of 0         1
1         1
2         1
3        -1
4         1
         ..
559995    1
559996    1
559997   -1
559998   -1
559999    1
Name: sentiment, Length: 740963, dtype: int64>


In [9]:
# Write dataframe to output csv file
ret_df.to_csv(output_file, index=False)