# Data Exploration
Start by exploring the provided dataset (emails.mat). Create two datasets having different features.


### (1) Data Exploration

In [1]:
# Install required packages

%%capture
%pip install pandas scipy matplotlib numpy scikit-learn

In [1]:
#Import required packages

import scipy.io
import scipy.sparse as sp
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the dataset
email_data = scipy.io.loadmat('emails.mat')
email_data

{'__header__': b'MATLAB 5.0 MAT-file, Platform: PCWIN, Created on: Tue Dec 07 13:57:47 2010',
 '__version__': '1.0',
 '__globals__': [],
 'X': <57173x10000 sparse matrix of type '<class 'numpy.float64'>'
 	with 2351499 stored elements in Compressed Sparse Column format>,
 'Y': array([[ 1, -1,  1, ...,  1,  1,  1]], dtype=int16)}

In [3]:
# Convert X into a dense matrix
sparse_data = email_data['X']
dense_data = sparse_data.toarray()
emails_df = pd.DataFrame(dense_data)
emails_df = emails_df.transpose()
emails_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57163,57164,57165,57166,57167,57168,57169,57170,57171,57172
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Check if there are Empty columns (All zeros)
zero_columns = emails_df.columns[(emails_df == 0).all()]
zero_columns

Int64Index([], dtype='int64')

In [6]:
# Check for non integer values
check_integer_values = emails_df.applymap(lambda x: isinstance(x, int) or x.is_integer()).all().all()
check_integer_values

True

In [7]:
# Check for negative values
negative_values = emails_df[(emails_df < 0).any()]
negative_values

  negative_values = emails_df[(emails_df < 0).any()]


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57163,57164,57165,57166,57167,57168,57169,57170,57171,57172


So far the BOW matrix does not have any discrepancies related to the representation i.e. each value is a non negative integer, and all words ocur at least once in the whole corpus.

In [4]:
# Check the discrepancies in labels
labels = email_data['Y']
label_vals = labels[0]
count_1 = np.count_nonzero(label_vals==1)
count_neg_1 = np.count_nonzero(label_vals==-1)
assert(count_1 + count_neg_1 == len(label_vals))

# Distribution of labels
print("Count of +1: ")
print(count_1)
print("Count of -1: ")
print(count_neg_1)

Count of +1: 
8030
Count of -1: 
1970


### (2) Data Preprocessing

Creating two datasets. First contains BOW representation with top n most frquent and top k least frequent words, which is used to establish a baseline. Second contains the TF-IDF representation of the vanilla BOW matrix.

#### (A) BOW Top-n, Top-k

In [5]:
# Change column names
column_names = [['feature{}'.format(i) for i in range(len(emails_df.columns))]]
emails_df.columns = column_names
emails_df.head(3)

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,...,feature57163,feature57164,feature57165,feature57166,feature57167,feature57168,feature57169,feature57170,feature57171,feature57172
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
frequencies = emails_df.sum()
frequencies

feature0         39.0
feature1          6.0
feature2          8.0
feature3         69.0
feature4        210.0
                ...  
feature57168     26.0
feature57169      8.0
feature57170     10.0
feature57171      5.0
feature57172      4.0
Length: 57173, dtype: float64

In [10]:
#Save frequencies for inferencing
frequencies.to_csv('freq.csv')

In [8]:
# Select top-n most frequent words and top-k least frequent terms
n = 5000
k = 5000

top_n = frequencies.sort_values(ascending=False).index[:n]
top_k = frequencies.sort_values(ascending=True).index[:k]

selected_cols = list(top_n) + list(top_k)
top_n_k_df = emails_df[selected_cols]
#Shuffle columns
top_n_k_df = top_n_k_df.sample(frac=1, axis=1)
top_n_k_df.head(3)


Unnamed: 0,feature17361,feature28910,feature36462,feature23854,feature7709,feature6663,feature14218,feature6254,feature10277,feature53420,...,feature38497,feature10500,feature32926,feature9149,feature10062,feature44147,feature41836,feature29764,feature47211,feature57017
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Add labels
top_n_k_df['labels'] = label_vals
top_n_k_df.head(3)

Unnamed: 0,feature17361,feature28910,feature36462,feature23854,feature7709,feature6663,feature14218,feature6254,feature10277,feature53420,...,feature10500,feature32926,feature9149,feature10062,feature44147,feature41836,feature29764,feature47211,feature57017,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [10]:
# Split into Training and Test Set
train_n_k_df, test_n_k_df = train_test_split(top_n_k_df, test_size=0.2, random_state=42)

In [11]:
# Check distribution in both the datasets
print("Distribution of labels in Training Set :")
train_distribution = train_n_k_df[train_n_k_df.columns[-1]].value_counts()
print(train_distribution)
print("Ratio :")
print(train_distribution.iloc[0]/train_distribution.iloc[1])
print("\n")
print("Distribution of labels in Test Set :")
test_distribution = test_n_k_df[test_n_k_df.columns[-1]].value_counts()
print(test_distribution)
print("Ratio :")
print(test_distribution.iloc[0]/test_distribution.iloc[1])

Distribution of labels in Training Set :
 1    6406
-1    1594
Name: (labels,), dtype: int64
Ratio :
4.018820577164367


Distribution of labels in Test Set :
 1    1624
-1     376
Name: (labels,), dtype: int64
Ratio :
4.319148936170213


In [12]:
# Save the datasets
sparse_train_df = sp.csr_matrix(train_n_k_df.values)
sparse_test_df = sp.csr_matrix(test_n_k_df.values)

In [13]:
sparse_train_data = sparse_train_df[:, :-1]  # Exclude the label column
sparse_train_labels = sparse_train_df[:, -1]  # Get the label column

scipy.io.savemat('datasets/top_nk_train.mat', {'X': sparse_train_data, 'Y': sparse_train_labels}, do_compression=True)


In [14]:
sparse_test_data = sparse_test_df[:, :-1]  # Exclude the label column
sparse_test_labels = sparse_test_df[:, -1]  # Get the label column

scipy.io.savemat('datasets/top_nk_test.mat', {'X': sparse_test_data, 'Y': sparse_test_labels}, do_compression=True)


#### (B) TF-IDF Representation

In [7]:
# Fit and transform BoW matrix
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tfidf_matrix = tfidf_transformer.fit_transform(emails_df)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=emails_df.columns)

In [8]:
# Select top p values
p = 5000
importance = tfidf_df.sum(axis=0)
importance

feature0        2.531674
feature1        0.682883
feature2        0.450685
feature3        2.349915
feature4        1.839469
                  ...   
feature57168    0.257994
feature57169    0.117439
feature57170    0.176885
feature57171    0.080750
feature57172    0.180277
Length: 57173, dtype: float64

In [13]:
top_p = importance.sort_values(ascending=False)
top_p = top_p[:p].index.tolist()
top_p_df = tfidf_df[top_p]
#Shuffle columns
top_p_df = top_p_df.sample(frac=1, axis=1)

In [14]:
# Add labels
top_p_df['labels'] = label_vals
top_p_df.head(3)

Unnamed: 0,feature13066,feature31536,feature30123,feature53227,feature34173,feature34008,feature3089,feature45032,feature12865,feature1387,...,feature2519,feature49561,feature43468,feature17114,feature23528,feature49209,feature45370,feature31291,feature1061,labels
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.049603,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04426,0.0,-1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [15]:
# Split into Training and Test Set
train_df, test_df = train_test_split(top_p_df, test_size=0.2, random_state=42)

In [16]:
# Check distribution in both the datasets
print("Distribution of labels in Training Set :")
train_distribution = train_df[train_df.columns[-1]].value_counts()
print(train_distribution)
print("Ratio :")
print(train_distribution.iloc[0]/train_distribution.iloc[1])
print("\n")
print("Distribution of labels in Test Set :")
test_distribution = test_df[test_df.columns[-1]].value_counts()
print(test_distribution)
print("Ratio :")
print(test_distribution.iloc[0]/test_distribution.iloc[1])

Distribution of labels in Training Set :
 1    6406
-1    1594
Name: (labels,), dtype: int64
Ratio :
4.018820577164367


Distribution of labels in Test Set :
 1    1624
-1     376
Name: (labels,), dtype: int64
Ratio :
4.319148936170213


In [20]:
# Convert into Sparse matrix again and save
sparse_train_df = sp.csr_matrix(train_df.values)
sparse_test_df = sp.csr_matrix(test_df.values)

In [21]:
sparse_train_data = sparse_train_df[:, :-1]  # Exclude the label column
sparse_train_labels = sparse_train_df[:, -1]  # Get the label column

scipy.io.savemat('datasets/train_df.mat', {'X': sparse_train_data, 'Y': sparse_train_labels}, do_compression=True)


In [22]:
sparse_test_data = sparse_test_df[:, :-1]  # Exclude the label column
sparse_test_labels = sparse_test_df[:, -1]  # Get the label column

scipy.io.savemat('datasets/test_df.mat', {'X': sparse_test_data, 'Y': sparse_test_labels}, do_compression=True)
