In [220]:
# reference https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html
# use scikit learn package to classify labels using RandomForest
import pandas as pd
import numpy as np

In [221]:
train = pd.read_csv('train.csv')

In [222]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on text columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]

In [223]:
# tokenizing texts with feature extraction text 
from sklearn.feature_extraction.text import CountVectorizer
Vec_Count = CountVectorizer()
Xtrain_counts = Vec_Count.fit_transform(train.text) 

In [224]:
# from occurance to Frequencies 
# to avoid issuses where longer text will have higher average count values than shorter ones
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=False).fit(Xtrain_counts)
Xtrain_tfidf = tfidf_transformer.transform(Xtrain_counts)

In [225]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

text = Pipeline([
                ('selector', TextSelector(key='text')),
                #('tfidf', TfidfVectorizer())
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer())
            ])

favoriteCount =  Pipeline([
                ('selector', NumberSelector(key='favoriteCount')),
                ('standard', StandardScaler())
            ])

retweetCount =  Pipeline([
                ('selector', NumberSelector(key='retweetCount')),
                ('standard', StandardScaler())
            ])

In [226]:
from sklearn.pipeline import FeatureUnion

feats = FeatureUnion([('text', text), 
                      ('favoriteCount', favoriteCount),
                      ('retweetCount', retweetCount)])

In [227]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer
# look for subsample k
# max feature 

max_features=np.sqrt(Xtrain_tfidf.shape[1])


clf_model = Pipeline([
    ('features',feats),
    ('classifier', RandomForestClassifier(n_estimators=200,class_weight='balanced_subsample',max_features=60,min_samples_split=2)),
])


clf_model.fit(train, train.label)

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('text', Pipeline(memory=None,
     steps=[('selector', TextSelector(key='text')), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content'...mators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False))])

In [228]:
# read the testing dataset csv file 
test = pd.read_csv('test.csv')

In [229]:
new_predicted = clf_model.predict(test)

In [230]:
# create a dataframe file
pd.DataFrame({'ID':test.id, 'Label':new_predicted}).head()

Unnamed: 0,ID,Label
0,0,1
1,1,-1
2,2,-1
3,3,1
4,4,1


In [231]:
# write the DataFrames indexing from the ID
pd.DataFrame({'ID':test.id, 'Label':new_predicted}).set_index('ID').head()

Unnamed: 0_level_0,Label
ID,Unnamed: 1_level_1
0,1
1,-1
2,-1
3,1
4,1


In [232]:
# write the DataFrames to a CSV file can be submitted to Kaggle
pd.DataFrame({'ID':test.id, 'Label':new_predicted}).set_index('ID').to_csv('sub_randForest10.csv')