In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
%matplotlib inline
import matplotlib.pyplot as plt
import re
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import warnings
import operator
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif
# Silence every warning category for the rest of the session. Note that this
# also hides pandas and scikit-learn warnings raised by the cells below.
warnings.filterwarnings('ignore')

# Print NumPy arrays with three digits after the decimal point.
np.set_printoptions(precision=3)

# Names that read_data() assigns at module level. A `global` statement at the
# top level of a module does nothing by itself; it only lists the shared names.
global checkinsStat, twitterLDAStat, twitterLIWCStat, twitterTextStat, instConceptsStst, train, test, ground_truth

In [None]:
# Reading Data
def read_data(seed=None):
    """Load every feature table and the labelled users into module-level globals.

    Reads the London Foursquare, Twitter and Instagram feature CSVs plus
    'LondonGroundTruth.csv', then builds `ground_truth` with three columns:

    * ``_id``          - the ground-truth file's 'row ID' column, renamed.
    * ``relationship`` - 1 where the original value is 'single', 0 otherwise
      (rows with a missing relationship are dropped first).
    * ``rand``         - one uniform draw in [0, 1) per row, which the calling
      cells threshold to split rows into train and test.

    Parameters
    ----------
    seed : int or None, optional
        When given, ``rand`` is drawn from a dedicated
        ``np.random.RandomState(seed)`` so the split is reproducible. The
        default (None) keeps drawing from NumPy's global generator.

    Returns
    -------
    None
        Results are published through the globals declared below. `test` is
        declared global but is not assigned by this function.
    """
    global checkinsStat, twitterLDAStat, twitterLIWCStat, twitterTextStat, instConceptsStst, train, test, ground_truth

    # Foursquare
    checkinsStat = pd.read_csv('featuresLondon/Foursquare/venueCategoriesFeatures5Months.csv')

    # Twitter
    twitterLDAStat = pd.read_csv('featuresLondon/Twitter/LDA50Features.csv')
    twitterLIWCStat = pd.read_csv('featuresLondon/Twitter/LIWCFeatures.csv')
    twitterTextStat = pd.read_csv('featuresLondon/Twitter/manuallyDefinedTextFeatures.csv')

    # TODO twits = ??

    # Instagram
    instConceptsStst = pd.read_csv('featuresLondon/Instagram/imageConceptsFeatures.csv')

    # Ground truth: keep only the users whose relationship status is known.
    train = pd.read_csv('LondonGroundTruth.csv', encoding = "ISO-8859-1")
    train = train.rename(columns = {'row ID' : '_id'}).dropna(subset=['relationship'])

    # NOTE: a separate held-out file ('NYTest.csv') used to be appended to the
    # ground truth here; that path is disabled, so `test` is left untouched.

    # One uniform draw per labelled row; `train` keeps this column as well.
    draw = np.random.random_sample if seed is None else np.random.RandomState(seed).random_sample
    train['rand'] = draw(len(train))

    # Work on an explicit copy and encode the label in one vectorised step.
    # Chained assignment (`frame.col[mask] = value`) on a column slice is not
    # guaranteed to write through to the frame and leaves an object-dtype column.
    ground_truth = train[['_id', 'relationship', 'rand']].copy()
    ground_truth['relationship'] = (ground_truth['relationship'] == 'single').astype(int)
    return

In [None]:
# Check-in (Foursquare) features: reload all tables, attach the labels and
# split on the per-row `rand` value (rows with rand <= 0.1 are held out).
read_data()
checkinsStat = pd.merge(checkinsStat, ground_truth, on='_id')

train = checkinsStat.loc[checkinsStat['rand'] > 0.1]
test = checkinsStat.loc[checkinsStat['rand'] <= 0.1]

# Features are every column except the id, the label and the split key.
Y_train = train['relationship']
X_train = train.drop(columns=['_id', 'relationship', 'rand'])

Y_test = test['relationship']
X_test = test.drop(columns=['_id', 'relationship', 'rand'])

print("Train data size: " + str(Y_train.shape[0]))
print("Test data size: " + str(Y_test.shape[0]))

In [None]:
# Feature selection: keep the 100 columns with the highest chi-squared score.
# The selector is fitted on the training rows only and then applied to both
# splits. It has its own name so the held-out `test` DataFrame is not replaced
# by the selector object.
# NOTE: X_train / X_test are overwritten with the reduced matrices, so rerun
# the data-preparation cell before rerunning this one.
selector = SelectKBest(score_func = chi2, k = 100)
fit = selector.fit(X_train, Y_train.astype(int))
X_train = fit.transform(X_train)
X_test = fit.transform(X_test)

# Baseline classifiers on the reduced features; each printed value is the
# classifier's mean accuracy on the held-out rows.
SVC = svm.SVC()
SVC.fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression()
LG.fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))

In [None]:
# Models for check-in history.
# Each estimator is fitted on the current training matrices and its score on
# the held-out rows is printed, in the order SVC, gradient boosting, logistic
# regression.
SVC = svm.SVC().fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier().fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression().fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))


In [None]:
# Twitter features: reload all tables, join the labels with the hand-defined
# text features and the LDA features, then hold out rows with rand <= 0.2.
read_data()
twitterStat = (
    ground_truth
    .merge(twitterTextStat, on='_id')
    .merge(twitterLDAStat, on='_id')
)
train = twitterStat.loc[twitterStat['rand'] > 0.2]
test = twitterStat.loc[twitterStat['rand'] <= 0.2]

# Features are every column except the id, the label and the split key.
Y_train = train['relationship']
X_train = train.drop(columns=['_id', 'relationship', 'rand'])

Y_test = test['relationship']
X_test = test.drop(columns=['_id', 'relationship', 'rand'])

print("Train data size: " + str(Y_train.shape[0]))
print("Test data size: " + str(Y_test.shape[0]))

In [None]:
# Feature selection: keep the 30 columns with the highest ANOVA F-score.
# The selector is fitted on the training rows only and then applied to both
# splits. It has its own name so the held-out `test` DataFrame is not replaced
# by the selector object.
# NOTE: X_train / X_test are overwritten with the reduced matrices, so rerun
# the data-preparation cell before rerunning this one.
selector = SelectKBest(score_func = f_classif, k = 30)
fit = selector.fit(X_train, Y_train.astype(int))
X_train = fit.transform(X_train)
X_test = fit.transform(X_test)

# Baseline classifiers on the reduced features; each printed value is the
# classifier's mean accuracy on the held-out rows.
SVC = svm.SVC()
SVC.fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression()
LG.fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))

In [None]:
# Models for Twitter history.
# Fit each estimator on the current training matrices and print its score on
# the held-out rows.
SVC = svm.SVC().fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier().fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression().fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))

In [None]:
# Instagram features: reload all tables, attach the labels to the image-concept
# features and hold out rows with rand <= 0.2.
read_data()
instConceptsStst = pd.merge(ground_truth, instConceptsStst, on='_id')
train = instConceptsStst.loc[instConceptsStst['rand'] > 0.2]
test = instConceptsStst.loc[instConceptsStst['rand'] <= 0.2]

# Features are every column except the id, the label and the split key.
Y_train = train['relationship']
X_train = train.drop(columns=['_id', 'relationship', 'rand'])

Y_test = test['relationship']
X_test = test.drop(columns=['_id', 'relationship', 'rand'])

print("Train data size: " + str(Y_train.shape[0]))
print("Test data size: " + str(Y_test.shape[0]))

In [None]:
# Models for Instagram history.
# Fit each estimator on the current training matrices and print its score on
# the held-out rows.
SVC = svm.SVC().fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier().fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression().fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))

In [None]:
# Twitter + Foursquare
# Reload all tables, build the labelled Twitter table (text + LDA features),
# append the check-in features and hold out rows with rand <= 0.1.
read_data()
twitterStat = (
    ground_truth
    .merge(twitterTextStat, on='_id')
    .merge(twitterLDAStat, on='_id')
)
TwitterFoursqare = pd.merge(twitterStat, checkinsStat, on='_id')

train = TwitterFoursqare.loc[TwitterFoursqare['rand'] > 0.1]
test = TwitterFoursqare.loc[TwitterFoursqare['rand'] <= 0.1]

# Features are every column except the id, the label and the split key.
Y_train = train['relationship']
X_train = train.drop(columns=['_id', 'relationship', 'rand'])

Y_test = test['relationship']
X_test = test.drop(columns=['_id', 'relationship', 'rand'])

print("Train data size: " + str(Y_train.shape[0]))
print("Test data size: " + str(Y_test.shape[0]))



In [None]:
# Feature selection: sweep k for SelectKBest and keep the best held-out score.
# NOTE(review): k is picked by scoring on the held-out split, so "Max Score"
# is an optimistic estimate.
max_score = 0
max_it = 0

# Cast the labels once instead of on every iteration.
y_train_int = Y_train.astype(int)
y_test_int = Y_test.astype(int)

# Stop at the number of available columns: asking SelectKBest for more
# features than exist is an error or a warning, depending on the
# scikit-learn version.
k_stop = min(450, X_train.shape[1] + 1)
for i in range(50, k_stop):
    # Dedicated name so the held-out `test` DataFrame is not overwritten.
    selector = SelectKBest(k = i)
    fit = selector.fit(X_train, y_train_int)
    X_train_it = fit.transform(X_train)
    X_test_it = fit.transform(X_test)

    # .score() on these classifiers is mean accuracy, hence the acc_ prefix.
    SVC = svm.SVC()
    SVC.fit(X_train_it, y_train_int)
    acc_svc = SVC.score(X_test_it, y_test_int)

    GBC = GradientBoostingClassifier()
    GBC.fit(X_train_it, y_train_int)
    acc_gbc = GBC.score(X_test_it, y_test_int)

    LG = LogisticRegression()
    LG.fit(X_train_it, y_train_int)
    acc_lg = LG.score(X_test_it, y_test_int)

    # Track the best score of the three models and the k that produced it.
    max_it_score = max(acc_svc, acc_gbc, acc_lg)
    if max_it_score > max_score:
        max_score = max_it_score
        max_it = i
        print(max_score)

print ("Max Score: " + str(max_score))
print ("KBest: " + str(max_it))


In [None]:
# Models for Twitter + Foursquare.
# Fit each estimator on the current training matrices and print its score on
# the held-out rows.
SVC = svm.SVC().fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier().fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression().fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))


In [None]:
# Twitter + Instagram
read_data()

# Rebuild the labelled Twitter table from the freshly loaded frames. Reusing a
# `twitterStat` left over from an earlier cell would carry that cell's
# `relationship` / `rand` columns into the merge, producing *_x / *_y
# duplicates and making this cell depend on execution order.
twitterStat = ground_truth.merge(twitterTextStat, on = '_id')
twitterStat = twitterStat.merge(twitterLDAStat, on = '_id')
TwitterInstagram = twitterStat.merge(instConceptsStst, on = '_id')

# Hold out rows with rand <= 0.1.
train = TwitterInstagram[TwitterInstagram['rand'] > 0.1]
test = TwitterInstagram[TwitterInstagram['rand'] <= 0.1]

# Features are every column except the id, the label and the split key.
X_train = train.drop(['_id', 'relationship', 'rand'], axis = 1)
Y_train = train['relationship']

X_test = test.drop(['_id', 'relationship', 'rand'], axis = 1)
Y_test = test['relationship']

print ("Train data size: " + str(len(Y_train)))
print ("Test data size: " + str(len(Y_test)))

In [None]:
# Feature selection: sweep k for SelectKBest and keep the best held-out score.
# NOTE(review): k is picked by scoring on the held-out split, so "Max Score"
# is an optimistic estimate.
max_score = 0
max_it = 0

# Cast the labels once instead of on every iteration.
y_train_int = Y_train.astype(int)
y_test_int = Y_test.astype(int)

# Stop at the number of available columns: asking SelectKBest for more
# features than exist is an error or a warning, depending on the
# scikit-learn version.
k_stop = min(600, X_train.shape[1] + 1)
for i in range(50, k_stop):
    # Dedicated name so the held-out `test` DataFrame is not overwritten.
    selector = SelectKBest(k = i)
    fit = selector.fit(X_train, y_train_int)
    X_train_it = fit.transform(X_train)
    X_test_it = fit.transform(X_test)

    # .score() on these classifiers is mean accuracy, hence the acc_ prefix.
    SVC = svm.SVC()
    SVC.fit(X_train_it, y_train_int)
    acc_svc = SVC.score(X_test_it, y_test_int)

    GBC = GradientBoostingClassifier()
    GBC.fit(X_train_it, y_train_int)
    acc_gbc = GBC.score(X_test_it, y_test_int)

    LG = LogisticRegression()
    LG.fit(X_train_it, y_train_int)
    acc_lg = LG.score(X_test_it, y_test_int)

    # Track the best score of the three models and the k that produced it.
    max_it_score = max(acc_svc, acc_gbc, acc_lg)
    if max_it_score > max_score:
        max_score = max_it_score
        max_it = i
        print(max_score)

print ("Max Score: " + str(max_score))
print ("KBest: " + str(max_it))


In [None]:
# Models for Twitter + Instagram.
# Fit each estimator on the current training matrices and print its score on
# the held-out rows.
SVC = svm.SVC().fit(X_train, Y_train.astype(int))
print(SVC.score(X_test, Y_test.astype(int)))

GBC = GradientBoostingClassifier().fit(X_train, Y_train.astype(int))
print(GBC.score(X_test, Y_test.astype(int)))

LG = LogisticRegression().fit(X_train, Y_train.astype(int))
print(LG.score(X_test, Y_test.astype(int)))

In [None]:
# Foursquare + Instagram
# Reload all tables, join check-in and image-concept features, attach the
# labels and hold out rows with rand <= 0.2.
read_data()
FoursquareInstagram = ground_truth.merge(
    checkinsStat.merge(instConceptsStst, on='_id'),
    on='_id',
)

train = FoursquareInstagram.loc[FoursquareInstagram['rand'] > 0.2]
test = FoursquareInstagram.loc[FoursquareInstagram['rand'] <= 0.2]

# Features are every column except the id, the label and the split key.
Y_train = train['relationship']
X_train = train.drop(columns=['_id', 'relationship', 'rand'])

Y_test = test['relationship']
X_test = test.drop(columns=['_id', 'relationship', 'rand'])

print("Train data size: " + str(Y_train.shape[0]))
print("Test data size: " + str(Y_test.shape[0]))

In [None]:
# Feature selection: sweep k for SelectKBest and keep the best held-out score.
# NOTE(review): k is picked by scoring on the held-out split, so "Max Score"
# is an optimistic estimate.
max_score = 0
max_it = 0

# Cast the labels once instead of on every iteration.
y_train_int = Y_train.astype(int)
y_test_int = Y_test.astype(int)

# Stop at the number of available columns: asking SelectKBest for more
# features than exist is an error or a warning, depending on the
# scikit-learn version.
k_stop = min(1200, X_train.shape[1] + 1)
for i in range(50, k_stop):
    # Dedicated name so the held-out `test` DataFrame is not overwritten.
    selector = SelectKBest(k = i)
    fit = selector.fit(X_train, y_train_int)
    X_train_it = fit.transform(X_train)
    X_test_it = fit.transform(X_test)

    # .score() on these classifiers is mean accuracy, hence the acc_ prefix.
    SVC = svm.SVC()
    SVC.fit(X_train_it, y_train_int)
    acc_svc = SVC.score(X_test_it, y_test_int)

    GBC = GradientBoostingClassifier()
    GBC.fit(X_train_it, y_train_int)
    acc_gbc = GBC.score(X_test_it, y_test_int)

    LG = LogisticRegression()
    LG.fit(X_train_it, y_train_int)
    acc_lg = LG.score(X_test_it, y_test_int)

    # Track the best score of the three models and the k that produced it.
    max_it_score = max(acc_svc, acc_gbc, acc_lg)
    if max_it_score > max_score:
        max_score = max_it_score
        max_it = i
        print("Iteration: " + str(i) + " Score: " + str(max_score))

print ("Max Score: " + str(max_score))
print ("KBest: " + str(max_it))

In [None]:
# Models for Foursquare + Instagram, compared by F1 on the held-out rows.
# The labels are cast to int explicitly, as in the other model cells: a label
# column that is stored with object dtype is rejected by scikit-learn as an
# unknown label type.
SVC = svm.SVC()
SVC.fit(X_train, Y_train.astype(int))
f1_svc = f1_score(Y_test.astype(int), SVC.predict(X_test))

GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train.astype(int))
f1_gbc = f1_score(Y_test.astype(int), GBC.predict(X_test))

LG = LogisticRegression()
LG.fit(X_train, Y_train.astype(int))
f1_lg = f1_score(Y_test.astype(int), LG.predict(X_test))

print("SVC f score " + str(f1_svc))
print("GBC f score " + str(f1_gbc))
print("LG f score " + str(f1_lg))

In [None]:
# Foursquare + Instagram + Twitter
read_data()
FoursquareInstagramTwitter = checkinsStat.merge(instConceptsStst, on = '_id')
# NOTE(review): only the hand-defined Twitter text features are joined here;
# the LDA features used by the other Twitter cells are not. Confirm this is
# intended.
FoursquareInstagramTwitter = FoursquareInstagramTwitter.merge(twitterTextStat, on = '_id')
FoursquareInstagramTwitter = ground_truth.merge(FoursquareInstagramTwitter, on = '_id')

# Split the three-source table built above. Splitting the two-source
# `FoursquareInstagram` frame from the earlier cell would silently drop the
# Twitter features and reuse that cell's stale split.
train = FoursquareInstagramTwitter[FoursquareInstagramTwitter['rand'] > 0.2]
test = FoursquareInstagramTwitter[FoursquareInstagramTwitter['rand'] <= 0.2]

# Features are every column except the id, the label and the split key.
X_train = train.drop(['_id', 'relationship', 'rand'], axis = 1)
Y_train = train['relationship']

X_test = test.drop(['_id', 'relationship', 'rand'], axis = 1)
Y_test = test['relationship']

print ("Train data size: " + str(len(Y_train)))
print ("Test data size: " + str(len(Y_test)))

In [None]:
# Feature selection: sweep k for SelectKBest and keep the best held-out score.
# NOTE(review): k is picked by scoring on the held-out split, so "Max Score"
# is an optimistic estimate.
max_score = 0
max_it = 0

# Cast the labels once instead of on every iteration.
y_train_int = Y_train.astype(int)
y_test_int = Y_test.astype(int)

# Stop at the number of available columns: asking SelectKBest for more
# features than exist is an error or a warning, depending on the
# scikit-learn version.
k_stop = min(1200, X_train.shape[1] + 1)
for i in range(50, k_stop):
    # Dedicated name so the held-out `test` DataFrame is not overwritten.
    selector = SelectKBest(k = i)
    fit = selector.fit(X_train, y_train_int)
    X_train_it = fit.transform(X_train)
    X_test_it = fit.transform(X_test)

    # .score() on these classifiers is mean accuracy, hence the acc_ prefix.
    SVC = svm.SVC()
    SVC.fit(X_train_it, y_train_int)
    acc_svc = SVC.score(X_test_it, y_test_int)

    GBC = GradientBoostingClassifier()
    GBC.fit(X_train_it, y_train_int)
    acc_gbc = GBC.score(X_test_it, y_test_int)

    LG = LogisticRegression()
    LG.fit(X_train_it, y_train_int)
    acc_lg = LG.score(X_test_it, y_test_int)

    # Track the best score of the three models and the k that produced it.
    max_it_score = max(acc_svc, acc_gbc, acc_lg)
    if max_it_score > max_score:
        max_score = max_it_score
        max_it = i
        print("Iteration: " + str(i) + " Score: " + str(max_score))

print ("Max Score: " + str(max_score))
print ("KBest: " + str(max_it))

In [None]:
# Models for Foursquare + Instagram + Twitter, compared by F1 on the held-out
# rows. The labels are cast to int explicitly, as in the other model cells: a
# label column that is stored with object dtype is rejected by scikit-learn as
# an unknown label type.
SVC = svm.SVC()
SVC.fit(X_train, Y_train.astype(int))
f1_svc = f1_score(Y_test.astype(int), SVC.predict(X_test))

GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train.astype(int))
f1_gbc = f1_score(Y_test.astype(int), GBC.predict(X_test))

LG = LogisticRegression()
LG.fit(X_train, Y_train.astype(int))
f1_lg = f1_score(Y_test.astype(int), LG.predict(X_test))

print("SVC f score " + str(f1_svc))
print("GBC f score " + str(f1_gbc))
print("LG f score " + str(f1_lg))

In [None]:
# Hand-picked subset of the combined table: three feature columns plus the
# id, label and split key. Rows with rand <= 0.2 are held out.
selectionMultiData = FoursquareInstagramTwitter.loc[:, [
    '_id', 'relationship', 'rand',
    'numberOfImages', 'numberOfTweets', 'categoryMentions',
]]

train = selectionMultiData.loc[selectionMultiData['rand'] > 0.2]
test = selectionMultiData.loc[selectionMultiData['rand'] <= 0.2]

# Features are every column except the id, the label and the split key.
Y_train = train['relationship']
X_train = train.drop(columns=['_id', 'relationship', 'rand'])

Y_test = test['relationship']
X_test = test.drop(columns=['_id', 'relationship', 'rand'])

print("Train data size: " + str(Y_train.shape[0]))
print("Test data size: " + str(Y_test.shape[0]))

In [None]:
# Selection models for Foursquare + Instagram + Twitter, compared by F1 on the
# held-out rows. The labels are cast to int explicitly, as in the other model
# cells: a label column that is stored with object dtype is rejected by
# scikit-learn as an unknown label type.
SVC = svm.SVC()
SVC.fit(X_train, Y_train.astype(int))
f1_svc = f1_score(Y_test.astype(int), SVC.predict(X_test))

GBC = GradientBoostingClassifier()
GBC.fit(X_train, Y_train.astype(int))
f1_gbc = f1_score(Y_test.astype(int), GBC.predict(X_test))

LG = LogisticRegression()
LG.fit(X_train, Y_train.astype(int))
f1_lg = f1_score(Y_test.astype(int), LG.predict(X_test))

print("SVC f score " + str(f1_svc))
print("GBC f score " + str(f1_gbc))
print("LG f score " + str(f1_lg))