In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
df = pd.read_csv('clean_data.csv')

In [3]:
df[df["description"].isnull()]

Unnamed: 0,job_id,title,company_profile,description,Special_Char,telecommuting,has_company_logo,has_questions,Ambiguous Range,missing_salary,...,industry_telecommunications,function_accounting auditing,function_administrative,function_customer service,function_engineering,function_information technology,function_none,function_other,function_sales,fraudulent


In [4]:
df2=pd.read_csv('clean_monster_data.csv')
df2.isnull().any(axis=0)

job_title           True
job_description    False
dtype: bool

In [5]:
df_labeled=df[["description","fraudulent"]].copy()
df_labeled.head()

Unnamed: 0,description,fraudulent
0,food fastgrowing james beard awardwinning onli...,0
1,organised focused vibrant awesomedo passion cu...,0
2,client located houston actively seeking experi...,0
3,company esri environmental systems research in...,0
4,job title itemization review managerlocation f...,0


In [6]:
df_unlabeled=df2[["job_description"]].copy()
df_unlabeled["fraudulent"]=""
df_unlabeled.rename(columns={"job_description":"description"},inplace=True)
df_unlabeled.head()

Unnamed: 0,description,fraudulent
0,teamsoft seeing support specialist join client...,
1,wisconsin state journal seeking flexible motiv...,
2,report job job depuy synthes companies member ...,
3,join altec youre considering career altec inc ...,
4,position id positions state ct city fairfield ...,


### Original
#### Tokenizer

In [7]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

train_indices, test_indices, y_train, y_test = train_test_split(np.arange(len(df_labeled)), df_labeled['fraudulent'],
                                                    stratify=df_labeled['fraudulent'], 
                                                    test_size=0.3,random_state=0)
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(np.array(train_indices).reshape(-1, 1), df_labeled.iloc[train_indices]['fraudulent'])
X_resampled=X_resampled.flatten()

In [8]:
max_fatures = 3000
df_labeled['description']=df_labeled['description'].astype(str)
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df_labeled['description'].values)
X1 = tokenizer.texts_to_sequences(df_labeled['description'].values)
X1 = pad_sequences(X1)

In [9]:
X_train1=X1[X_resampled]
X_test1=X1[test_indices]

In [10]:
classifier_org = LogisticRegression(random_state=0,max_iter=1000)
classifier_org.fit(X_train1,y_resampled)
y_pred1 = classifier_org.predict(X_test1)
print(confusion_matrix(y_test,y_pred1))
print(f1_score(y_test,y_pred1))
print(accuracy_score(y_test,y_pred1))

[[3429 1675]
 [ 110  150]]
0.14388489208633093
0.6672259507829977


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [11]:
classifier_org_rf = RandomForestClassifier(random_state=0,n_estimators=1000)
classifier_org_rf.fit(X_train1,y_resampled)
y_pred2 = classifier_org_rf.predict(X_test1)
print(confusion_matrix(y_test,y_pred2))
print(f1_score(y_test,y_pred2))
print(accuracy_score(y_test,y_pred2))

[[5100    4]
 [ 146  114]]
0.6031746031746031
0.9720357941834452


#### Verctorizer

In [12]:
vectorizer = CountVectorizer(max_features=3000,ngram_range=(1,1))
X=vectorizer.fit_transform(df_labeled["description"])
X_train2 = X[X_resampled]
X_test2= X[test_indices]

In [13]:
classifier_org_vec = LogisticRegression(random_state=0,max_iter=3000)
classifier_org_vec.fit(X_train2,y_resampled)
y_pred3 = classifier_org_vec.predict(X_test2)
print(confusion_matrix(y_test,y_pred3))
print(f1_score(y_test,y_pred3))
print(accuracy_score(y_test,y_pred3))

[[4932  172]
 [  73  187]]
0.6042003231017771
0.9543251304996272


In [14]:
classifier_org_vec_rf = RandomForestClassifier(random_state=0,n_estimators=1000)
classifier_org_vec_rf.fit(X_train2,y_resampled)
y_pred4 = classifier_org_vec_rf.predict(X_test2)
print(confusion_matrix(y_test,y_pred4))
print(f1_score(y_test,y_pred4))
print(accuracy_score(y_test,y_pred4))

[[5097    7]
 [ 105  155]]
0.7345971563981043
0.9791200596569725


### Semi-supervised

The random forest performs better, we will select random forest to pseudo label the unlabeled data

#### Tokenizer

In [15]:
combined=pd.concat([df_labeled,df_unlabeled],axis=0)

In [16]:
max_fatures = 3000
combined['description']=combined['description'].astype(str)
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(combined['description'].values)
X_c = tokenizer.texts_to_sequences(combined['description'].values)
X_c = pad_sequences(X_c)

In [17]:
X_c_unlabeled=X_c[17880:]

In [18]:
classifier_c_rf = RandomForestClassifier(random_state=0)
classifier_c_rf.fit(X_c[X_resampled],y_resampled)
pseudo_1 = classifier_c_rf.predict_proba(X_c_unlabeled)[:,1]

In [19]:
Pseudo_label1=np.where(pseudo_1> 0.5, 1, 0)
df_unlabeled["fraudulent"]=Pseudo_label1
df_unlabeled_fake=df_unlabeled[df_unlabeled["fraudulent"]==1]

In [20]:
df_semi=pd.concat([df_labeled,df_unlabeled_fake])

In [21]:
df_unlabeled_fake

Unnamed: 0,description,fraudulent
7245,job description,1
12357,inlinepaneldisplay block important paddingtoppx,1


#### Vectorizer

In [22]:
vectorizer2 = CountVectorizer(max_features=3000,ngram_range=(1,1))
X_c2=vectorizer2.fit_transform(combined["description"])
X_c2_unlabeled=X_c[17880:]

In [23]:
classifier_c2_rf = RandomForestClassifier(random_state=0)
classifier_c2_rf.fit(X_c[X_resampled],y_resampled)
pseudo_2 = classifier_c2_rf.predict_proba(X_c_unlabeled)[:,1]

In [24]:
Pseudo_label2=np.where(pseudo_2> 0.5, 1, 0)
df_unlabeled["fraudulent"]=Pseudo_label2
df_unlabeled_fake2=df_unlabeled[df_unlabeled["fraudulent"]==1]

In [25]:
df_unlabeled_fake2

Unnamed: 0,description,fraudulent
7245,job description,1
12357,inlinepaneldisplay block important paddingtoppx,1


The vectorizer find the same fake posting

### Run on combined data

#### Tokenizer

In [26]:
df=pd.concat([df_labeled,df_unlabeled_fake],axis=0)

In [27]:
max_fatures = 3000
df['description']=df['description'].astype(str)
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(combined['description'].values)
X_f = tokenizer.texts_to_sequences(combined['description'].values)
X_f = pad_sequences(X_f)

In [28]:
X_train_f=np.concatenate((X_f[X_resampled],X_f[-2:]), axis=0)
y_train_f=np.concatenate((y_resampled,df["fraudulent"][-2:]), axis=0)

In [29]:
ros_Tok = RandomOverSampler(random_state=0)
X_resampled_Tok, y_resampled_Tok = ros_Tok.fit_resample(X_train_f, y_train_f)

In [30]:
classifier_f_rf = RandomForestClassifier(random_state=0,n_estimators=1000)
classifier_f_rf.fit(X_resampled_Tok,y_resampled_Tok)
pred_new= classifier_f_rf.predict(X_f[test_indices])

In [31]:
print(confusion_matrix(y_test,pred_new))
print(f1_score(y_test,pred_new))
print(accuracy_score(y_test,pred_new))

[[5101    3]
 [ 148  112]]
0.5973333333333334
0.9718493661446681


#### Vectorizer

In [32]:
df2=pd.concat([df_labeled,df_unlabeled_fake2],axis=0)

In [33]:
vectorizer3 = CountVectorizer(max_features=3000,ngram_range=(1,1))
vectorizer3.fit(df2["description"])
df_train=df2["description"][train_indices]
df_unlabeled_train=df2["description"][17880:]
X_f2=vectorizer3.transform(df_train.append(df_unlabeled_train))
y_f2=df2["fraudulent"][train_indices].append(df2["fraudulent"][17880:])

In [34]:
ros_Vec = RandomOverSampler(random_state=0)
X_resampled_Vec, y_resampled_Vec = ros_Vec.fit_resample(X_f2, y_f2)

In [35]:
X_test=vectorizer3.transform(df2["description"][test_indices])
y_test=df2["fraudulent"][test_indices]

In [36]:
classifier_f_rf2 = RandomForestClassifier(random_state=0,n_estimators=1000)
classifier_f_rf2.fit(X_resampled_Vec,y_resampled_Vec)
pred_new2= classifier_f_rf2.predict(X_test)
print(confusion_matrix(y_test,pred_new2))
print(f1_score(y_test,pred_new2))
print(accuracy_score(y_test,pred_new2))

[[5096    8]
 [ 110  150]]
0.7177033492822966
0.9780014914243103
