In [1]:
# python version 3.6.0
import pandas as pd
import os
import numpy as np
import re

from datetime import datetime
from sklearn.preprocessing import Imputer, LabelEncoder

# change working directory
data_path = '/Users/jaeyoonjung/Desktop/Data&Data Classification Challenge - Facebook/\
Data&Data Classification Challenge - Facebook - Training Set.csv'

# load the training data
data = pd.read_csv(data_path, delimiter="\t")

print ('The dataset has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
print ('Owner types are {}' .format(set(data['owner_type'])))
print ('Target labels are {}'.format(set(data['INDEX New'])))
print ('Percentage of Fake Seller: {:.2f}%'.format(len(data[data['INDEX New'] == 'Fake Seller']) / \
                                                   data.shape[0] * 100))
print ('Percentage of Reseller: {:.2f}%'.format(len(data[data['INDEX New'] == 'Reseller']) / \
                                                   data.shape[0] * 100))
print ('Percentage of No Seller: {:.2f}%'.format(len(data[data['INDEX New'] == 'No Seller']) / \
                                                   data.shape[0] * 100))

The dataset has 35182 rows and 11 columns
Owner types are {nan, 'user', 'page'}
Target labels are {'No Seller', 'Reseller', 'Fake Seller'}
Percentage of Fake Seller: 26.08%
Percentage of Reseller: 27.24%
Percentage of No Seller: 46.69%


In [2]:
from nltk.stem import SnowballStemmer
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

In [3]:
label = data['INDEX New']
features = data.drop('INDEX New', axis = 1)

print('There are {} rows that are missing profile_picture' \
     .format(len(features[pd.isnull(features.profile_picture)])))
print('There are {} rows that have picture_labels without pictures_url' \
      .format(len(features[pd.isnull(features.pictures_url) & pd.notnull(features.picture_labels)])))

There are 0 rows that are missing profile_picture
There are 0 rows that have picture_labels without pictures_url


In [4]:
features['published_hour'] = features.published_at.apply(
    lambda x: datetime.strptime(x, '%m/%d/%y %I:%M %p').hour if len(x) > 10 else np.nan
)

# features['published_day_of_week'] = features.published_at.apply(
#     lambda x: datetime.strptime(x, '%m/%d/%y %I:%M %p').isoweekday() if len(x) > 10 else \
#     datetime.strptime(x, '%m/%d/%Y').isoweekday()
# )

# word count of description as a new feature
features['description_length'] = features.description.apply(
    lambda x: len(x.split()) if isinstance(x, str) else 0
)

# number of picture labels
features['picture_label_occurrences'] = features.picture_labels.apply(
    lambda x: x.count(',') + 1 if (not isinstance(x, float)) or (isinstance(x, float) and not np.isnan(x))
    else 0
)

# does the post have any hastag?
features['hashtags'] = features.description.apply(
    lambda x: x.count('#') if (not isinstance(x, float)) or (isinstance(x, float) and not np.isnan(x))
    else 0
)

features['punctuations'] = features.description.apply(
    lambda x: x.count('!') if (not isinstance(x, float)) or (isinstance(x, float) and not np.isnan(x))
    else 0
)

# did the writer leave personal contact?
phone = re.compile(r'\s[[0-9]{9,10}|[0-9]{3,4}\s[0-9]{6,7}]\s')
contact_flag = ['call', 'contact', '@', 'whatsapp', 'text', 'message','pm', phone]

features['has_contact'] = features.description.apply(
    lambda x: int(any(bool(re.search(s, re.sub(r'[\-|\+|\(|\)|\.|\,]', '',x.lower()))) for s in contact_flag)) \
    if (not isinstance(x, float)) or (isinstance(x, float) and not np.isnan(x)) \
    else 0 
)

features['uppercase_count'] =  features.description.apply(
    lambda x: (sum(1 for c in x if c.isupper())) + 1 if not isinstance(x, float)
    else 0
)

#add one to differentiate NaN
features['uppercase_ratio'] =  features.description.apply(
    lambda x: (sum(1 for c in x if c.isupper()) + 1) / len(x) if not isinstance(x, float)
    else 0
)


features['has_pic_url'] = features.pictures_url.apply(
    lambda x: 0 if not isinstance(x, float) else 1
)



In [5]:
features = features.drop('profile_picture', axis = 1) \
    .drop('pictures_url', axis = 1) \
    .drop('published_at', axis = 1)


In [None]:
#distribution of numeric features seems very skewed (except for published_hour and published_day_of_week)
features.describe()

In [6]:
#impute nan values in published_hour
hour_imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=1)
hour_imp.fit(features.published_hour.values.reshape(1, -1))
features.published_hour = sum(hour_imp.transform(features.published_hour.values.reshape(1, -1)).tolist(),[])

In [7]:
# one hot encode categorial variable
encoded_owner = pd.get_dummies(features.owner_type)
features = pd.concat([features, encoded_owner], axis=1)

In [8]:
features = features.drop('owner_type', axis= 1) \
    .drop('found_keywords', axis = 1) \
    .drop('picture_labels', axis = 1) 

In [9]:
LE = LabelEncoder()
label = LE.fit_transform(label)

In [10]:
LE.classes_

array(['Fake Seller', 'No Seller', 'Reseller'], dtype=object)

In [None]:
#Split data for cross validation
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size = 0.2, random_state = 0)

# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

In [None]:
#strip punctuation and numbers and extract stem of each feature 
def preprocessor(text):
    #strip punc
    processed = re.sub(r'[#|\!|\-|\+|:|//|\']', "", text)
    #strip number
    processed = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', ' ', processed).strip()
    #consolidate whitespace
    processed = re.sub('[\s]+', ' ', processed).strip()
    processed =  " ".join([SnowballStemmer("english").stem(word) for word in processed.split()])
    return processed

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# pic_label_count = CountVectorizer()
# pic_label_count.fit(X_train.picture_labels.values.astype('U'))
# pic_label_train = pic_label_count.transform(X_train.picture_labels.values.astype('U'))
# pic_label_test = pic_label_count.transform(X_test.picture_labels.values.astype('U'))

# picture_pca = PCA(n_components = 50)
# picture_pca.fit(pic_label_train.toarray())
# decomposed_pic_train = picture_pca.transform(pic_label_train.toarray())
# decomposed_pic_test = picture_pca.transform(pic_label_test.toarray())

In [None]:
# description text
from sklearn.feature_extraction.text import TfidfVectorizer
description_vectorizer = TfidfVectorizer(sublinear_tf=True, 
                                         #stop_words='english', 
                                         ngram_range=(1, 2), \
                                         preprocessor = preprocessor)
description_vectorizer.fit(X_train.description.values.astype('U'))
tfidf_train = description_vectorizer.transform(X_train.description.values.astype('U'))

In [None]:
print ("The sparse document matrix has {} features".format(tfidf_train.shape[1]))

In [None]:
#dimension reduction: LSA
tfidf_lsa = TruncatedSVD(n_components=150)
reduced_tfidf_train = tfidf_lsa.fit_transform(tfidf_train)
tfidf_test = description_vectorizer.transform(X_test.description.values.astype('U'))
reduced_tfidf_test = tfidf_lsa.transform(tfidf_test)

In [None]:
decomposed_X_train = X_train.drop('description', axis = 1)
decomposed_X_test = X_test.drop('description', axis = 1)

In [None]:
feature_train = np.hstack((decomposed_X_train, reduced_tfidf_train))
feature_test = np.hstack((decomposed_X_test, reduced_tfidf_test))

In [None]:
# Initialize a normalizer, then apply it to the features
feature_scaler = MinMaxScaler()
feature_scaler.fit(feature_train)
feature_train = feature_scaler.transform(feature_train)
feature_test = feature_scaler.transform(feature_test)

In [None]:
clf_RF = RandomForestClassifier(n_estimators=30, max_depth=None,min_samples_split=2, random_state=0)
clf_RF = clf_RF.fit(feature_train, y_train)
clf_RF.score(feature_test, y_test)

In [None]:
feature_train.shape

In [None]:
clf_ANN = MLPClassifier(activation='relu', hidden_layer_sizes=(165, ), solver='adam', alpha=1e-5, random_state=1)
clf_ANN.fit(feature_train, y_train)
clf_ANN.score(feature_test, y_test)
#165 minus pic

In [None]:
clf_AB = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=5), n_estimators=300, learning_rate=0.1)
clf_AB = clf_AB.fit(feature_train, y_train)
clf_AB.score(feature_test, y_test)

In [None]:
clf_GB = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, \
                                    max_depth=10, max_features = 'auto').fit(feature_train, y_train)
clf_GB.score(feature_test, y_test)        

In [None]:
print (clf_GB.score(feature_test, y_test))

In [None]:
predicted = clf_GB.predict(feature_test)
print(metrics.classification_report(y_test, predicted, target_names=['Fake Seller', 'No Seller', 'Reseller']))

In [16]:
#strip punctuation and numbers and extract stem of each feature 
from nltk.stem import SnowballStemmer
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics
    
def preprocessor(text):
    processed = re.sub(r'[#|\!|\-|\+|:|//|\']', "", text)
    processed = re.sub(r'(?:(?:\d+,?)+(?:\.?\d+)?)', ' ', processed).strip()
    processed = re.sub('[\s]+', ' ', processed).strip()
    processed =  " ".join([SnowballStemmer("english").stem(word) for word in processed.split()])
    return processed

#K FOLD VALIDATION
def extract_feature(train_idx, test_idx, feature_df):
    X_train = feature_df.iloc[train_idx]
    X_test = feature_df.iloc[test_idx]
    
    # tf-idf
    description_vectorizer = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 2), preprocessor=preprocessor)
    description_vectorizer.fit(X_train.description.values.astype('U'))
    tfidf_train = description_vectorizer.transform(X_train.description.values.astype('U'))
    tfidf_test = description_vectorizer.transform(X_test.description.values.astype('U'))

    # LSA
    tfidf_lsa = TruncatedSVD(n_components=150, random_state=0)
    reduced_tfidf_train = tfidf_lsa.fit_transform(tfidf_train)
    reduced_tfidf_test = tfidf_lsa.transform(tfidf_test)
    
    # combine text and non-text features
    feature_train = np.hstack((X_train.drop('description', axis = 1), reduced_tfidf_train))
    feature_test = np.hstack((X_test.drop('description', axis = 1), reduced_tfidf_test))
    
    # scale features
    feature_scaler = MinMaxScaler()
    feature_scaler.fit(feature_train)
    feature_train = feature_scaler.transform(feature_train)
    feature_test = feature_scaler.transform(feature_test)
    
    print (feature_train.shape)
    
    return feature_train, feature_test

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

clf_RF = RandomForestClassifier(n_estimators=30, max_depth=None,min_samples_split=2, random_state=0)

clf_AB = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=5), \
                            n_estimators=300, learning_rate=0.1, random_state=None)

clf_GB = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, \
                                    max_depth=10, max_features = 'auto', random_state=0)
clf_ANN = MLPClassifier(activation='relu', hidden_layer_sizes=(165, ), solver='adam', random_state=0)


clf_model = clf_ANN
model_result = []
kf = KFold(n_splits=5, shuffle = True, random_state=0)
i = 1
for train, test in kf.split(features.values):
    print ('Fold {}'.format(i))
    train_final, test_final = extract_feature(train, test, features)
    clf_model.fit(train_final, label[train])
    output = clf_model.score(test_final, label[test])
    model_result.append(output)
    
    print ('Accuracy:{}'.format(output))
    i = i + 1

print ('Overall Accuracy: {}'.format(np.mean(model_result)))

Fold 1
(28145, 164)
Accuracy:0.8040358107147932
Fold 2
(28145, 164)
Accuracy:0.7935199658945573
Fold 3
(28146, 164)
Accuracy:0.7556850483229107
Fold 4
(28146, 164)
Accuracy:0.7819783968163729
Fold 5
(28146, 164)
Accuracy:0.7839681637293917
Overall Accuracy: 0.7838374770956051


In [19]:
ANN_resut = model_result

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

clf_RF = RandomForestClassifier(n_estimators=30, max_depth=None,min_samples_split=2, random_state=0)

clf_AB = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=5), \
                            n_estimators=300, learning_rate=0.1, random_state=None)

clf_GB = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, \
                                    max_depth=10, max_features = 'auto', random_state=0)
clf_ANN = MLPClassifier(activation='relu', hidden_layer_sizes=(165, ), solver='adam', random_state=0)


clf_model = clf_RF
model_result = []
kf = KFold(n_splits=5, shuffle = True, random_state=0)
i = 1
for train, test in kf.split(features.values):
    print ('Fold {}'.format(i))
    train_final, test_final = extract_feature(train, test, features)
    clf_model.fit(train_final, label[train])
    output = clf_model.score(test_final, label[test])
    model_result.append(output)
    
    print ('Accuracy:{}'.format(output))
    i = i + 1

print ('Overall Accuracy: {}'.format(np.mean(model_result)))

Fold 1
(28145, 164)
Accuracy:0.8497939462839278
Fold 2
(28145, 164)
Accuracy:0.8438254938183886
Fold 3
(28146, 164)
Accuracy:0.8439454235361
Fold 4
(28146, 164)
Accuracy:0.8459351904491188
Fold 5
(28146, 164)
Accuracy:0.8439454235361
Overall Accuracy: 0.8454890955247271


In [21]:
rf_result = model_result

In [None]:
%%time

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

clf_RF = RandomForestClassifier(n_estimators=30, max_depth=None,min_samples_split=2, random_state=0)

clf_AB = AdaBoostClassifier(base_estimator = RandomForestClassifier(max_depth=5), \
                            n_estimators=300, learning_rate=0.1, random_state=None)

clf_GB = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, \
                                    max_depth=6, max_features = 'auto', random_state=0)
clf_ANN = MLPClassifier(activation='relu', hidden_layer_sizes=(165, ), solver='adam', random_state=0)


clf_model = clf_GB
model_result = []
kf = KFold(n_splits=5, shuffle = True, random_state=0)
i = 1
for train, test in kf.split(features.values):
    print ('Fold {}'.format(i))
    train_final, test_final = extract_feature(train, test, features)
    # cProfile.run('clf_model.fit(train_final, label[train])')
    clf_model.fit(train_final, label[train])
    output = clf_model.score(test_final, label[test])
    model_result.append(output)
    
    print ('Accuracy:{}'.format(output))
    i = i + 1

print ('Overall Accuracy: {}'.format(np.mean(model_result)))

Fold 1
(28145, 164)
Accuracy:0.8489413102174223
Fold 2
(28145, 164)
Accuracy:0.8415517976410403
Fold 3
(28146, 164)
Accuracy:0.8381182490051166
Fold 4
(28146, 164)


In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
i = 1
worst_train = []
worst_test = []
for train, test in kf.split(features.values):
    if i == 2:
        worst_train =train 
        worst_test = test 
    i = i + 1

In [None]:
worst_test

In [None]:
rf_reult = [0.69546681824641188, 0.66576666192979961,]

In [None]:
ANN_result = [0.69546681824641188, 0.75444081284638342, ]