# Preprocessing

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from library.sb_utils import save_file

As always, I'll start by loading the data.

In [2]:
# Read the cleaned dataset from the project's data directory into a DataFrame.
data = pd.read_csv(filepath_or_buffer="../data/clean_data.csv")

In [3]:
# Preview the first three rows to confirm the columns loaded as expected.
data.iloc[:3]

Unnamed: 0,salary_range,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
0,0,0,1,0,Other,Internship,Unspecified,Marketing,0,marketing intern were food weve created ground...
1,0,0,1,0,Full-time,Not Applicable,Unspecified,Customer Service,0,customer service cloud video production second...
2,0,0,1,0,Other,Not Applicable,Unspecified,Other,0,commissioning machinery assistant cma valor se...


As nice as it was to look at all the numerical features during EDA, those features are not going to be needed in this project since I am dealing with text data. Perhaps if I had identified clear correlations between the numerical data and the fraudulent classification, I would have considered using those features.

In [4]:
# Columns that are not used for the text-based model; only the label
# ('fraudulent') and the consolidated 'text' column are kept.
unused_columns = [
    'salary_range',
    'telecommuting',
    'has_company_logo',
    'has_questions',
    'employment_type',
    'required_experience',
    'required_education',
    'function',
]
# Mutates `data` in place, so re-running this cell on the already-trimmed
# frame raises a KeyError.
data.drop(columns=unused_columns, inplace=True)

## Text Processing

Now all that is left to do is deal with the text data. I have already consolidated the text data into one column, converted everything to lowercase, and removed all the stop words.

In [5]:
# Build a TF-IDF vectorizer whose vocabulary is capped at 10,000 terms.
# The previous argument was a garbled expression that referenced an undefined
# name; 10000 matches the estimator repr recorded in this notebook's output.
vectorizer = TfidfVectorizer(max_features=10000)

In [6]:
# Learn the vocabulary and IDF weights from the consolidated text column.
# NOTE(review): the fit_transform call in the next cell fits the vectorizer
# again, so this separate fit appears to be redundant.
vectorizer.fit(raw_documents=data['text'])

TfidfVectorizer(max_features=10000)

In [7]:
# Target labels: the 'fraudulent' column of the cleaned data.
y = data['fraudulent']

# Fit the vectorizer on the text column and produce the TF-IDF matrix, then
# expand it to a dense NumPy array.
# NOTE(review): the vectorizer is fit on every row before the train/test
# split performed later, so the IDF weights are computed using the rows that
# end up in the test set as well.
tfidf_matrix = vectorizer.fit_transform(data['text'])
X = tfidf_matrix.toarray()

In [8]:
# Wrap the dense TF-IDF array in a DataFrame with one column per vocabulary term.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases that
# do not provide it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
X = pd.DataFrame(X, columns=feature_names)

In [9]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
# NOTE(review): the split is not stratified on y - consider whether that
# matters for the class balance of this dataset.
split_parts = train_test_split(X, y, random_state=1, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

In [10]:
# Display the training feature matrix (pandas truncates the rendering for wide/long frames).
X_train

Unnamed: 0,aa,aaa,aan,ab,abakus,abap,abc,aberdeen,abilitiesexperience,ability,...,zoekt,zone,zoning,zoopla,zoottle,zopa,zpompano,zu,zweig,zylun
17169,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11170,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4040,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076266,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4491,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1480,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045027,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Convert both label splits to DataFrames, then give each the single
# column label 'fraudulent'.
y_train = pd.DataFrame(y_train)
y_test = pd.DataFrame(y_test)
for label_frame in (y_train, y_test):
    label_frame.columns = ['fraudulent']

## Save Data


In [12]:
# Save the trimmed DataFrame (`data`, after the column drop above) as
# preprocessed_data.csv in the data directory. `datapath` is reused by the
# save cells that follow. Per the recorded output, save_file asks for
# confirmation before overwriting an existing file.
datapath = '../data'
save_file(data, 'preprocessed_data.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data/preprocessed_data.csv"


In [13]:
# Save the test-split TF-IDF features as X_test.csv under `datapath`.
save_file(X_test, 'X_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data/X_test.csv"


In [14]:
# Save the training-split TF-IDF features as X_train.csv under `datapath`.
save_file(X_train, 'X_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data/X_train.csv"


In [15]:
# Save the training-split labels (single 'fraudulent' column) as y_train.csv under `datapath`.
save_file(y_train, 'y_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data/y_train.csv"


In [16]:
# Save the test-split labels (single 'fraudulent' column) as y_test.csv under `datapath`.
save_file(y_test, 'y_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)y
Writing file.  "../data/y_test.csv"
