# Models using just metadata

## Loading data

In [2]:
import pandas as pd
train_tree = pd.read_csv('train_metafeatures_tree.csv')
test_tree = pd.read_csv('test_metafeatures_tree.csv')
train_normalized = pd.read_csv('train_metafeatures_normalized.csv')
test_normalized = pd.read_csv('test_metafeatures_normalized.csv')

In [3]:
for df in [train_tree, train_normalized]:
    df.drop(columns='id',inplace=True)

In [4]:
train_tree.head()

Unnamed: 0,num_char,num_words,num_hash,num_mention,num_url,has_location,geocoded,longitude_t,latitude_t,target
0,68,13,1,0,0,False,False,1000.0,1000.0,1
1,38,7,0,0,0,False,False,1000.0,1000.0,1
2,133,22,0,0,0,False,False,1000.0,1000.0,1
3,56,7,1,0,0,False,False,1000.0,1000.0,1
4,85,16,2,0,0,False,False,1000.0,1000.0,1


# Train a randomforest

In [5]:
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_jobs=-2, random_state=42)

In [7]:
y = train_tree['target']
X = train_tree.drop(columns=['target'])

In [8]:
from sklearn.model_selection import cross_validate
scores = cross_validate(rf, X, y, cv=cv, return_train_score=True, scoring='f1')
scores

{'fit_time': array([1.22259402, 0.3805418 , 0.37985849, 0.37764621, 0.38075972]),
 'score_time': array([0.04386902, 0.03773141, 0.04218435, 0.03549647, 0.03636932]),
 'test_score': array([0.54736842, 0.5562701 , 0.56279809, 0.56384743, 0.56648308]),
 'train_score': array([0.96391153, 0.96195652, 0.96306376, 0.95970411, 0.96394833])}

In [9]:
scores['test_score'].mean()

0.5593534246860525

# Train a boosted tree

In [12]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
params = {'max_depth': [1,2,3,4], 'n_estimators':[5,10,20,40,80]}
lg = LGBMClassifier(n_jobs=-2, random_state=42)
clf = GridSearchCV(lg, params, scoring="f1", verbose=1, n_jobs=-2, cv=5)

In [13]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X, y, cv=cv, return_train_score=True, scoring='f1')
scores

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    1.9s finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    1.6s finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    1.5s finished


{'fit_time': array([1.97792363, 1.62320352, 1.54509377, 1.55077291, 1.55653834]),
 'score_time': array([0.00713468, 0.00811434, 0.00670052, 0.00675488, 0.00670028]),
 'test_score': array([0.58208955, 0.57860616, 0.59475219, 0.57274119, 0.56855151]),
 'train_score': array([0.57967138, 0.64244898, 0.58733421, 0.58622631, 0.58333333])}

In [14]:
scores['test_score'].mean()

0.5793481205209567

# Train a SVM

In [15]:
from sklearn.svm import SVC
svm = SVC(kernel="rbf", random_state=42)
params = {'C': [0.05, 0.1, 0.2, 0.5, 0.75, 1, 1.5, 2, 4]}
clf = GridSearchCV(svm, params, scoring="f1", verbose=1, n_jobs=-2, cv=5)

In [16]:
y = train_normalized['target']
X = train_normalized.drop(columns='target')

In [17]:
from sklearn.model_selection import cross_validate
scores = cross_validate(clf, X, y, cv=cv, return_train_score=True, scoring='f1')
scores

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   17.8s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   17.5s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   18.9s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   20.4s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=-2)]: Done  45 out of  45 | elapsed:   20.7s finished


{'fit_time': array([19.06694508, 18.70535755, 20.4974041 , 21.83928394, 22.23850465]),
 'score_time': array([0.16956973, 0.16245985, 0.20043349, 0.20185161, 0.2004962 ]),
 'test_score': array([0.59780908, 0.58418168, 0.59242424, 0.58563536, 0.58751903]),
 'train_score': array([0.5972571 , 0.59480724, 0.59891389, 0.59439614, 0.60299981])}

In [18]:
scores['test_score'].mean()

0.5895138759800866

In [21]:
def prepare_submission(model, X, y, X_test, name):
    fit = model.fit(X,y)
    pred = model.predict(X_test.drop(columns='id'))
    submission = pd.DataFrame({"id":X_test['id'], "target":pred})
    submission.to_csv(name+'.csv', index=False)
    return fit

In [22]:
prepare_submission(svm, X, y, test_normalized, 'metamodel_svm')

SVC(random_state=42)