# Preprocessing

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from library.sb_utils import save_file

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype=np.int):


As always, I'll start by loading the data.

In [2]:
# Read the cleaned job-postings table produced by the earlier cleaning step.
clean_data_path = "../data/clean_data.csv"
data = pd.read_csv(clean_data_path)

In [3]:
# Peek at the first three rows to confirm the columns loaded as expected.
data.head(n=3)

Unnamed: 0,salary_range,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,function,fraudulent,text
0,0,0,1,0,Other,Internship,Unspecified,Marketing,0,marketing intern u ny new york we re food52 we...
1,0,0,1,0,Full-time,Not Applicable,Unspecified,Customer Service,0,customer service cloud video production nz auc...
2,0,0,1,0,Other,Not Applicable,Unspecified,Other,0,commissioning machinery assistant cma u ia wev...


As nice as it was to look at all the numerical features during EDA, those features are not going to be needed in this project since I am dealing with text data. Perhaps if I had identified clear correlations between the numerical data and the fraudulent classification, I would consider using those features.

In [4]:
# Columns that are not part of the text model; only the remaining columns
# (the `fraudulent` label and the consolidated `text`) are kept.
non_text_columns = [
    'salary_range', 'telecommuting', 'has_company_logo', 'has_questions',
    'employment_type', 'required_experience', 'required_education', 'function',
]
# Reassign instead of mutating in place: `inplace=True` returns None, prevents
# chaining, and silently changes the frame that earlier cells displayed.
data = data.drop(columns=non_text_columns)

## Text Processing

Now all that is left to do is deal with the text data. I have already consolidated the text data into one column, converted everything to lowercase and removed all the stop words.

In [5]:
# TF-IDF settings: cap the learned vocabulary at 1000 terms so the feature
# matrix stays a manageable width; every other option is left at its default.
tfidf_settings = {"max_features": 1000}
vectorizer = TfidfVectorizer(**tfidf_settings)

In [6]:
# Fit the vectorizer on the full `text` column; the cell output shows the
# fitted estimator's parameters.
# NOTE(review): a later cell calls `fit_transform` on the same column, which
# refits from scratch, so this separate fit looks redundant - confirm before removing.
vectorizer.fit(data['text'])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
# Features: fit the vectorizer and turn every posting into a TF-IDF row, then
# densify the sparse result so it can be wrapped in a DataFrame afterwards.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split happens, so held-out rows contribute to the IDF weights - confirm this
# is acceptable for the evaluation.
tfidf_matrix = vectorizer.fit_transform(data['text'])
X = tfidf_matrix.toarray()

# Target: the fraud label for each posting.
y = data['fraudulent']

In [8]:
# Wrap the dense TF-IDF array in a DataFrame with one column per vocabulary term.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out()`; fall back to the old name only on
# releases that predate the new one, so the cell runs on either.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
X = pd.DataFrame(X, columns=feature_names)

In [9]:
# Hold out 35% of the rows as a test set; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): the split is not stratified - consider `stratify=y` if the
# fraudulent class turns out to be rare.
train_test_parts = train_test_split(X, y, random_state=1, test_size=0.35)
X_train, X_test, y_train, y_test = train_test_parts

In [10]:
# Display the training feature matrix (one column per TF-IDF term); the row
# labels are the original row positions selected by the shuffled split.
X_train

Unnamed: 0,000,10,100,12,15,1500,16,18,20,200,...,worldwide,would,write,writing,written,year,york,you,young,your
8877,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.071274,0.052901,0.000000,0.000000,0.000000,0.0,0.000000
12224,0.000000,0.040326,0.042723,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.047726,0.000000,0.029649,0.000000,0.042319,0.000000,0.0,0.000000
2366,0.000000,0.083083,0.000000,0.000000,0.0,0.201461,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.038635,0.000000,0.000000,0.0,0.000000
8034,0.000000,0.079981,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1960,0.094315,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.102434,...,0.0,0.0,0.052560,0.043992,0.000000,0.020652,0.000000,0.028417,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.093602,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.064803,0.000000,0.000000,0.000000,0.041859,0.0,0.000000
17289,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.058498,0.000000,0.160986,0.0,0.000000
5192,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.136774
12172,0.000000,0.000000,0.000000,0.000000,0.0,0.080694,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


## Save Data


In [11]:
# Save the reduced frame (what is left of `data` after the column drop) to a
# new CSV file in the shared data directory.
# `save_file` is a project helper; per the recorded output it asks for
# confirmation before overwriting an existing file, so this cell needs
# interactive input when the file is already present.
datapath = '../data'
save_file(data, 'preprocessed_data.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data\preprocessed_data.csv"


In [12]:
# Save the held-out TF-IDF feature rows under `datapath` (set in the first save cell).
save_file(X_test, 'X_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data\X_test.csv"


In [13]:
# Save the training TF-IDF feature rows under `datapath` (set in the first save cell).
save_file(X_train, 'X_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data\X_train.csv"


In [14]:
# Save the training labels under `datapath` (set in the first save cell).
# NOTE(review): `y_train` is a Series, and the warning fragment in this cell's
# output (`data.to_csv(fpath, index=False)`) suggests the helper calls
# `to_csv` on it directly; the header default for Series.to_csv differs across
# pandas versions - confirm the written file has the header downstream code expects.
save_file(y_train, 'y_train.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data\y_train.csv"


  data.to_csv(fpath, index=False)


In [15]:
# Save the held-out labels under `datapath` (set in the first save cell).
# NOTE(review): `y_test` is a Series rather than a DataFrame - presumably it is
# written the same way as the other splits; verify the CSV header matches what
# the modelling step expects.
save_file(y_test, 'y_test.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data\y_test.csv"
