# CSV Converter

## Auto-reload of imported modules

In [189]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import libraries

In [190]:
from html_cleaner import remove_html_tags_from_file
from os import listdir, mkdir
from os.path import exists, isfile, join

import numpy as np
import pandas as pd

## Functions

### Fetch the names of the `.eml` files in a directory

In [191]:
def get_files_from_dir_path(dir_path):
    """Return the names of the entries in ``dir_path`` that end with ``.eml``.

    Only bare entry names are returned (no directory prefix), in the order
    ``listdir`` yields them. An empty list is returned when nothing matches.
    """
    return [entry for entry in listdir(dir_path) if entry.endswith('.eml')]

### Convert all files to text

In [163]:
def convert_all_files_to_text(base_src_dir_path):
    """Run the HTML cleaner over every ``.eml`` file in a directory.

    The id of a file is the part of its name before the first ``.``, reduced
    to whatever follows the last ``_`` in that part
    (e.g. ``TRAIN_12.eml`` -> ``'12'``). Ids are kept as strings.

    Args:
        base_src_dir_path: Directory holding the ``.eml`` files. A trailing
            path separator is optional.

    Returns:
        A ``(text_array, id_array)`` tuple of parallel lists: the value
        ``remove_html_tags_from_file`` returned for each file (presumably the
        cleaned text -- confirm against ``html_cleaner``) and the matching id.
    """
    text_array = []
    id_array = []

    for src_file in get_files_from_dir_path(base_src_dir_path):
        id_array.append(src_file.split('.')[0].split('_')[-1])
        # join() adds the separator only when it is missing, so the path is
        # correct whether or not the caller passed a trailing slash. Plain
        # string concatenation would glue the directory and file name together.
        text_array.append(remove_html_tags_from_file(join(base_src_dir_path, src_file)))

    return (text_array, id_array)

### Create train dataset from text

In [195]:
def create_train_data_dataframe(train_id_array, train_text_array, label_path='dataset/spam-mail.tr.label'):
    """Combine training ids, texts and their labels into one DataFrame.

    Args:
        train_id_array: Sequence of ids; each one must be convertible with
            ``int()`` to a value found in the label file's ``Id`` column.
        train_text_array: Sequence of texts, parallel to ``train_id_array``.
        label_path: CSV file with ``Id`` and ``Prediction`` columns. Defaults
            to the path this notebook has always read.

    Returns:
        A DataFrame with the columns ``id``, ``prediction`` and ``content``,
        one row per entry of ``train_id_array`` and in the same order.

    Raises:
        KeyError: If an id has no row in the label file.
    """
    labels = pd.read_csv(label_path)

    # Build the Id -> Prediction lookup once instead of scanning the whole
    # label frame for every id. drop_duplicates keeps the first row of a
    # repeated Id, which is the row a first-match scan would have picked.
    label_by_id = labels.drop_duplicates(subset='Id').set_index('Id')['Prediction'].to_dict()

    prediction_array = []
    for train_id in train_id_array:
        key = int(train_id)
        if key not in label_by_id:
            raise KeyError('No label found for id {!r} in {}'.format(train_id, label_path))
        prediction_array.append(label_by_id[key])

    return pd.DataFrame({'id': train_id_array, 'prediction': prediction_array, 'content': train_text_array})

### Create test dataset from text

In [196]:
def create_test_data_dataframe(test_id_array, test_text_array):
    """Pair test ids with their texts in a DataFrame.

    Returns a frame with the columns ``id`` and ``content``, in that order,
    holding the two parallel sequences passed in.
    """
    columns = {
        'id': test_id_array,
        'content': test_text_array,
    }
    return pd.DataFrame(columns)

## Main program

### Train data

#### Getting train data from raw files

In [193]:
# Run the HTML cleaner over the raw training files; the helper returns (texts, ids) in that order.
train_text_array, train_id_array = convert_all_files_to_text('dataset/extracted/TR/')

#### Creating dataset from raw train data

In [194]:
# Build the labelled training frame from the ids and texts extracted above.
dataset = create_train_data_dataframe(train_id_array, train_text_array)

# Write it out as UTF-8 CSV; index=False keeps the DataFrame's row index out of the file.
dataset.to_csv('dataset/train_data.csv', encoding='utf-8', index=False)

### Test data

#### Getting test data from raw files

In [197]:
# Run the HTML cleaner over the raw test files; the helper returns (texts, ids) in that order.
test_text_array, test_id_array = convert_all_files_to_text('dataset/extracted/TT/')

#### Creating dataset from raw test data

In [198]:
# Build the (unlabelled) test frame from the ids and texts extracted above.
# NOTE: this rebinds `dataset`, replacing the training frame held under the same name.
dataset = create_test_data_dataframe(test_id_array, test_text_array)

# Write it out as UTF-8 CSV; index=False keeps the DataFrame's row index out of the file.
dataset.to_csv('dataset/test_data.csv', encoding='utf-8', index=False)