In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [2]:
# Load the tweet corpus from a CSV in the working directory. Later cells read
# the `user_id` and `tweet` columns of this frame.
TRAINSET_PATH = './modified.csv'
df = pd.read_csv(TRAINSET_PATH, encoding="utf-8")
# Sanity check: (number of rows, number of columns).
print(df.shape)

(328932, 2)


In [3]:
# Cast the user_id column from int to str so the ids are handled as
# categorical labels (later cells compare them against string ids).
# `astype(str)` is the vectorized equivalent of `apply(str)`, and bracket
# assignment avoids the pitfalls of setting a column through attribute access.
df["user_id"] = df["user_id"].astype(str)
# Number of tweets per author, most prolific first.
df["user_id"].value_counts()

4185    284
5519    273
319     273
4267    265
4562    264
       ... 
4542      1
691       1
1521      1
8672      1
2580      1
Name: user_id, Length: 9297, dtype: int64

In [4]:
# IMPORTANT: Tweets Preprocessing
import re
import nltk as nlp

# Matches URL-like tokens: optional scheme, a dotted host name, optional path/query.
pattern = r'((http|ftp|https):\/\/)?([\w-]+(?:(?:\.[\w-]{2,})+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'

# Compiled / constructed once instead of on every call (the function is applied
# to every row of the corpus).
_URL_RE = re.compile(pattern)
# One or more characters that are not letters: non-word characters, digits and
# underscores. Replacing each run with a single space yields exactly the same
# tokens as the previous chain of substitutions (r'\s\W', r'\W,\s', r'\W',
# r'\d+', r'\s+', '[!@#$_]'), because those only ever turned non-letter
# characters into spaces before the final whitespace split.
_NON_LETTER_RE = re.compile(r'[\W\d_]+')
_LEMMATIZER = nlp.WordNetLemmatizer()


def pre_processing(tweet):
    """Normalize one tweet into a space-separated string of lemmatized words.

    Steps: strip URL-like tokens, replace every run of non-letter characters
    (punctuation, digits, underscores, whitespace) with a single space,
    lower-case, then lemmatize each remaining word with WordNet (default
    noun part of speech).

    Parameters
    ----------
    tweet : str
        Raw tweet text. Non-string values (e.g. NaN produced by an empty CSV
        cell) are treated as an empty tweet instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned tweet; empty string if nothing is left.
    """
    if not isinstance(tweet, str):
        return ""
    without_urls = _URL_RE.sub('', tweet)
    # Lower-casing happens after the substitution, as in the original order.
    letters_only = _NON_LETTER_RE.sub(' ', without_urls).lower()
    # str.split() drops leading/trailing/duplicate whitespace, so join() needs
    # no trailing strip.
    return " ".join(_LEMMATIZER.lemmatize(word) for word in letters_only.split())


df["tweet"] = df["tweet"].apply(pre_processing)

In [5]:
# Ad-hoc inspection: rows whose cleaned tweet contains the substring 'ybox'.
# NOTE(review): `temp` is not referenced by any later cell in this notebook.
temp = df[df.tweet.str.contains('ybox')]

In [6]:
# Remove unimportant words (stop words) such as "a", "the", "that", "and", "in".
# nlp.download("stopwords")  # stop words = very common words that carry little meaning
from nltk.corpus import stopwords

In [7]:
# scikit-learn vectorizers document `stop_words` as 'english', a list, or None,
# and recent releases validate the parameter type, so a set is rejected.
# Keep the NLTK stop words as a de-duplicated, deterministically ordered list.
english_stopwords = sorted(set(stopwords.words('english')))

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Target labels (author ids) and the pre-processed tweet texts, in row order.
y = df.user_id
ALL_TWEETS = df.tweet.tolist()
# TODO: find a way to avoid capping the vocabulary size / pruning terms purely
# by term frequency.
# NOTE(review): the vectorizer is fitted on the whole corpus, i.e. before the
# train/test split performed in a later cell, so IDF weights also see test tweets.
vectorizer = TfidfVectorizer(
    analyzer='word',
    lowercase=False,  # tweets were already lower-cased during pre-processing
    stop_words=english_stopwords,
    max_features=10000,
)
X = vectorizer.fit_transform(ALL_TWEETS)
# # >>> print(X.shape)
# # Display All Feature(term) names
# # >>> print(vectorizer.get_feature_names()) 

In [9]:
# Sanity check: (number of tweets, number of TF-IDF features).
print(X.shape)
# Uncomment to list every term in the fitted vocabulary:
# print(vectorizer.get_feature_names()) 

(328932, 10000)


In [10]:
# Train a classifier for a specific user based on logistic regression
from sklearn.model_selection import train_test_split
# Author whose tweets form the positive class (the most frequent user_id in
# the value counts shown earlier).
TARGET_USER_ID = '4185'

x = X
y = df["user_id"]
# Reproducible seed derived from the group name, reduced modulo 2**32 - 1 so it
# stays inside the integer range accepted by `random_state`.
RANDOM_SEED = int.from_bytes("Group95".encode(), 'little') % (2**32 - 1)
# NOTE(review): the split is not stratified, so the handful of positive tweets
# is distributed between train and test by chance.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)
# One-vs-rest labels: 1 if the tweet was written by TARGET_USER_ID, else 0.
# A vectorized comparison replaces the per-element Python lambda.
y_train_transformed = (y_train == TARGET_USER_ID).astype("int64")
y_test_transformed = (y_test == TARGET_USER_ID).astype("int64")


In [11]:
# Class balance of the binary training labels (0 = other authors, 1 = target author).
y_train_transformed.value_counts()

0    262927
1       218
Name: user_id, dtype: int64

In [11]:
# Author: Olivier Grisel <olivier.grisel@ensta.org>
#         Peter Prettenhofer <peter.prettenhofer@gmail.com>
#         Mathieu Blondel <mathieu@mblondel.org>
# License: BSD 3 clause
from pprint import pprint
from time import time
import logging

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# The `print(__doc__)` inherited from the scikit-learn example was dropped: in a
# notebook it only prints the interactive module's unrelated docstring.

# Configure root logging: INFO and above, timestamped. With no `stream` or
# `filename` argument, basicConfig attaches a handler that writes to stderr.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')


# #############################################################################
# # Load some categories from the training set
# categories = [
#     'alt.atheism',
#     'talk.religion.misc',
# ]
# # Uncomment the following to do the analysis on all the categories
# #categories = None

# print("Loading 20 newsgroups dataset for categories:")
# print(categories)

# data = fetch_20newsgroups(subset='train', categories=categories)

from sklearn.model_selection import train_test_split

# Binary target: 1 for tweets written by user '4185', 0 for everyone else.
y_transformed = (y == '4185').astype("int64")
print("%d tweets" % len(ALL_TWEETS))
print("%d categories" % len(y_transformed.value_counts()))
print()

# Tune hyper-parameters on the training portion only. Using the same
# test_size and RANDOM_SEED as the earlier split of the TF-IDF matrix selects
# the same rows, so the tweets later used for testing never influence the search.
tweets_train, _tweets_test, y_search_train, _y_search_test = train_test_split(
    ALL_TWEETS, y_transformed, test_size=0.2, random_state=RANDOM_SEED)

# #############################################################################
# Define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    # stop_words must be a list (or 'english' / None) for scikit-learn.
    ('tfidf', TfidfVectorizer(lowercase=False, analyzer='word',
                              stop_words=list(english_stopwords))),
    ('clf', LogisticRegression(solver='lbfgs')),
])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
# NOTE(review): this grid already has 3 * 4 * 2 * 1 * 6 * 4 = 576 candidates
# (2880 fits with cv=5); the recorded run was interrupted before finishing, so
# consider trimming the grid.
parameters = {
    'tfidf__max_df': (0.5, 0.75, 1.0),
    'tfidf__max_features': (None, 5000, 10000, 50000),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
#     'tfidf__use_idf': (True, False),
    # 'tfidf__norm': ('l1', 'l2'),
    'clf__max_iter': (100,),
    'clf__C': (0.01, 0.05, 0.1, 0.3, 0.5, 1.0),
#     'clf__penalty': ('l2', 'l1'),
    'clf__class_weight': ({1:30}, {1:40}, {1:50}, {1:100}) 
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected
    # block

    # find the best parameters for both the feature extraction and the
    # classifier.
    # The positive class is a tiny fraction of the data, so the default
    # accuracy score is ~1.0 for every candidate and cannot rank them; select
    # on the F1 score of the positive class instead.
    grid_search = GridSearchCV(pipeline, parameters, cv=5, scoring='f1',
                               n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    grid_search.fit(tweets_train, y_search_train)
    print("done in %0.3fs" % (time() - t0))
    print()

    # With scoring='f1' this is the mean cross-validated F1 of the best candidate.
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    # best_params_ holds exactly the searched keys with their winning values.
    best_parameters = grid_search.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

Automatically created module for IPython interactive environment
328932 tweets
2 categories

Performing grid search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__C': (0.01, 0.05, 0.1, 0.3, 0.5, 1.0),
 'clf__class_weight': ({1: 30}, {1: 40}, {1: 50}, {1: 100}),
 'clf__max_iter': (100,),
 'tfidf__max_df': (0.5, 0.75, 1.0),
 'tfidf__max_features': (None, 5000, 10000, 50000),
 'tfidf__ngram_range': ((1, 1), (1, 2))}
Fitting 5 folds for each of 576 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:  9.0min
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed: 21.4min


KeyboardInterrupt: 

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report

# Binary one-vs-rest model for the target author; the positive class is
# up-weighted 50x to counter the heavy class imbalance.
clf = LogisticRegression(solver='lbfgs', C=.05, class_weight={1: 50})
clf.fit(x_train, y_train_transformed)

# Hard 0/1 predictions for the held-out tweets.
y_test_pred = pd.Series(clf.predict(x_test))
# print(y_test_pred.value_counts())
# print(y_test_transformed.value_counts())
# >>> accuracy_score(y_test_transformed, y_test_pred)

# F1 of the positive class only, followed by the full per-class report.
positive_class_f1 = f1_score(y_test_transformed, y_test_pred, average='binary', pos_label=1)
print(positive_class_f1)

report_text = classification_report(y_test_transformed, y_test_pred)
print(report_text)

0    65730
1       57
dtype: int64
0    65721
1       66
Name: user_id, dtype: int64
0.13008130081300812
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65721
           1       0.14      0.12      0.13        66

    accuracy                           1.00     65787
   macro avg       0.57      0.56      0.56     65787
weighted avg       1.00      1.00      1.00     65787



In [92]:
# Predicted class probabilities for every test tweet, one column per class of
# whichever `clf` was fitted most recently.
# NOTE(review): presumably column 1 is P(label == 1) for the binary model — confirm via clf.classes_.
proba_distro = clf.predict_proba(x_test)

array([[0.9845969 , 0.0154031 ],
       [0.98343872, 0.01656128],
       [0.99400227, 0.00599773],
       ...,
       [0.9110532 , 0.0889468 ],
       [0.98648495, 0.01351505],
       [0.98277848, 0.01722152]])

In [104]:
# Positions of the test tweets whose second-column probability exceeds 0.5.
np.nonzero(proba_distro[:, 1] > 0.5)

(array([ 3193,  7999,  8447,  9297, 15701, 19385, 24120, 24615, 25318,
        30555, 38278, 41241, 48007, 48619, 49201, 51298, 53189, 54894,
        57602, 58805, 59946, 65051], dtype=int64),)

In [67]:
# Boolean mask marking the positive training examples.
# NOTE(review): the recorded output (263145) equals the total number of training
# rows, not a mask; the cell was presumably run as len(...) — if the intent is
# to count positives, use `.sum()` on this mask. Confirm.
(y_train_transformed == 1)

263145

In [None]:
# Train a classifier for multiple classes: predict the author (user_id) of
# each tweet directly.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# NOTE(review): this rebinds `clf`, which the binary-model cells also use.
clf = LogisticRegression()
clf.fit(x_train, y_train)

y_test_pred = pd.Series(clf.predict(x_test))
# Score the predicted user ids against the true user ids. The previous call
# compared the binary 0/1 labels with the raw ids and never used the predictions.
accuracy_score(y_test, y_test_pred)

In [None]:
# Baseline run of the binary experiment in a single cell: default TF-IDF
# settings (no stop-word list, no vocabulary cap) and an unweighted logistic
# regression with C=0.3.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Feature Extraction using TF-IDF
ALL_TWEETS = df["tweet"].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(ALL_TWEETS)
# >>> print(X.shape)
# Display All Feature(term) names
# >>> print(vectorizer.get_feature_names()) 

# Train a classifier for a specific user based on logistic regression
x = X
y = df["user_id"]
# Reproducible seed derived from the group name, kept inside the integer range
# accepted by `random_state`.
RANDOM_SEED = int.from_bytes("Group95".encode(), 'little') % (2**32 - 1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_SEED)

# One-vs-rest labels: 1 if the tweet was written by user '4185', else 0
# (vectorized comparison instead of a per-element lambda).
y_train_transformed = (y_train == '4185').astype("int64")
y_test_transformed = (y_test == '4185').astype("int64")

clf = LogisticRegression(C=.3, solver='lbfgs')
clf.fit(x_train, y_train_transformed)

y_test_pred = pd.Series(clf.predict(x_test))
# Only the last bare expression of a cell is displayed, so print both
# distributions explicitly; the predicted one used to be silently discarded.
print(y_test_pred.value_counts())
print(y_test_transformed.value_counts())
# >>> accuracy_score(y_test_transformed, y_test_pred)
