In [1]:
import pandas as pd
import pprint, os, chardet

In [2]:
# Define input files directory and output file name
input_dir_name = "raw_datasets"
output_file = "SA.csv"
os.makedirs(input_dir_name, exist_ok=True)

if not os.path.exists(output_file):
    with open(output_file, "w") as f:
        # f.write("text,sentiment\n")
        f.write("")

This script location is in the /data directory of the project. It assumes the following data directory structure:

```
Skynet
| - data
|    | - [input_dir_name]
|    | - clean-csv.ipynb
|    | - [output_file]
```

In [3]:
# Use all files in script directory as input
inputs = []
input_dir_name = os.path.join(os.path.abspath(""), input_dir_name)
for filename in os.listdir(input_dir_name):
    filepath = os.path.join(os.path.abspath(input_dir_name), filename)
    if os.path.isfile(filepath):
        inputs.append(filepath)

# Validate input
pprint.pprint(inputs)

['/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/yelp_train.csv',
 '/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/test.csv',
 '/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/IMDB Dataset.csv',
 '/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/train.csv',
 '/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/testing.csv',
 '/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/yelp_test.csv']


In [4]:
# WARNING: Add column names to the beginning of the csv files before running this snippet
# Each file must have the 2 required columns review/text and label
dfs = pd.DataFrame()
for filepath in inputs:
    # Detect Encoding using chardet Library
    with open(filepath, "rb") as f:
        data = f.read()
    encoding = chardet.detect(data)["encoding"]
    print(filepath, "encoding:", encoding)

    df = pd.read_csv(filepath, encoding=encoding)  # Read csv
    print(filepath, "columns:", df.columns.tolist())

    # Rename relevant column names to 'text' and 'label'
    if "review" in df:
        df.rename(columns={"review": "text"}, inplace=True)
    elif "tweet content" in df:
        df.rename(columns={"tweet content": "text"}, inplace=True)
    df.rename(columns={"sentiment": "label"}, inplace=True)

    print(filepath, "columns after cleaning:", df.columns.tolist())
    df = df[["text", "label"]]  # Keep only relevant columns

    # Remove all NA values
    df.dropna(inplace=True)  # Drop NAs

    # Preview unique values in dataframe
    print(filepath, "label unique values:", df["label"].unique())

    # Add df into big dataframe
    dfs = pd.concat([dfs, df])

/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/yelp_train.csv encoding: ascii
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/yelp_train.csv columns: ['sentiment', 'review']
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/yelp_train.csv columns after cleaning: ['label', 'text']
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/yelp_train.csv label unique values: [1 2]
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/test.csv encoding: utf-8
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/test.csv columns: ['textID', 'text', 'sentiment', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km�)', 'Density (P/Km�)']
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/test.csv columns after cleaning: ['textID', 'text', 'label', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km�)', 'Density (P/Km�)']
/Users/kevinhsu/Downloads/AIT 526/Skynet/data/raw_datasets/test.csv lab

In [20]:
# Review return dataset
dfs

Unnamed: 0,text,label
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive
...,...,...
14,Exceeded my expectations!,positive
15,Not what I expected.,negative
16,Will definitely buy again!,positive
17,Very bad experience.,negative


In [5]:
# Re-encode sentiment to the integers. Also drops irrelevant data
#  0 - negative
#  1 - neutral
#  2 - positive

# Update this map using the unique values of each dataset from the result obtained above
sent_map = {
    "positive": 2,
    "Positive": 2,
    "neutral": 1,
    "Neutral": 1,
    "negative": 0,
    "Negative": 0,
    1: 1,
    2: 2,
}


def reencode_sentiment(sentiment):
    return sent_map[sentiment]


ret_df = dfs.copy()
ret_df["label"] = ret_df["label"].map(sent_map)

print("Unique values of label column:", ret_df["label"].unique())
print("Count of values in label column:")
print(ret_df["label"].value_counts)

ret_df.dropna(inplace=True)
ret_df["label"] = ret_df["label"].astype(int)  # Coerced float values into int

Unique values of label column: [ 1.  2.  0. nan]
Count of values in label column:
<bound method IndexOpsMixin.value_counts of 0        1.0
1        2.0
2        1.0
3        1.0
4        2.0
        ... 
37995    1.0
37996    2.0
37997    1.0
37998    1.0
37999    1.0
Name: label, Length: 679033, dtype: float64>


In [22]:
# Review return dataset
ret_df

Unnamed: 0,text,label
0,Last session of the day http://twitpic.com/67ezh,1
1,Shanghai is also really exciting (precisely -...,2
2,"Recession hit Veronique Branquinho, she has to...",0
3,happy bday!,2
4,http://twitpic.com/4w75p - I like it!!,2
...,...,...
14,Exceeded my expectations!,2
15,Not what I expected.,0
16,Will definitely buy again!,2
17,Very bad experience.,0


In [23]:
print(
    "Unique values of label column after removing irrelevant values:",
    ret_df["label"].unique(),
)
print("Count of values in label column after removing irrelevant values:")
print(ret_df["label"].value_counts)

Unique values of label column after removing irrelevant values: [1 2 0]
Count of values in label column after removing irrelevant values:
<bound method IndexOpsMixin.value_counts of 0     1
1     2
2     0
3     2
4     2
     ..
14    2
15    0
16    2
17    0
18    2
Name: label, Length: 81032, dtype: int64>


In [6]:
# Write dataframe to output csv file
ret_df.to_csv(output_file, index=False)