In [1]:
import numpy as np
from scipy import sparse
from collections import Counter
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd

stop_words = set(stopwords.words('english') + list(string.punctuation))

In [2]:
def tokenize(text):
    '''
    :param text: a doc with multiple sentences, type: str
    return a word list, type: list
    https://textminingonline.com/dive-into-nltk-part-ii-sentence-tokenize-and-word-tokenize
    e.g. 
    Input: 'It is a nice day. I am happy.'
    Output: ['it', 'is', 'a', 'nice', 'day', 'i', 'am', 'happy']
    '''
    tokens = []
    # YOUR CODE HERE
    for word in nltk.word_tokenize(text):
        word = word.lower()
        if word not in stop_words and not word.isnumeric():
            tokens.append(word)
    return tokens
#     tokens_stem = stem(tokens)
#     return tokens_stem

In [3]:

def bigram(tokens):
    bigram = []
#     tokens=stem(tokens)
    for i in range(len(tokens)-1):
        big = (tokens[i], tokens[i+1])
        bigram.extend(big)
    return bigram
    

In [4]:
from nltk.stem.porter import *
def stem(tokens):
    stemmer = PorterStemmer()
    tokens_stem = [stemmer.stem(token) for token in tokens]
    return tokens_stem

In [5]:
def get_bagofwords(data, vocab_dict):
    '''
    :param data: a list of words, type: list
    :param vocab_dict: a dict from words to indices, type: dict
    return a word (sparse) matrix, type: scipy.sparse.csr_matrix
    https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.lil_matrix.html
    https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html
    '''
    data_matrix = sparse.lil_matrix((len(data), len(vocab_dict)))
    # YOUR CODE HERE
    for i, doc in enumerate(data):
        for word in doc:
            word_idx = vocab_dict.get(word, -1)
            #return -1 if not in dict
            if word_idx != -1:
                data_matrix[i, word_idx] += 1

    data_matrix = data_matrix.tocsr() #to speed up in computation

    return data_matrix

In [6]:
def read_data(file_name, vocab=None):
    """
    https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
    """
    df = pd.read_csv(file_name)
    df['words'] = df['text'].apply(tokenize)
    df['bigrams'] = df['words'].apply(bigram)
    
#     bigram = set()
    if vocab is None:
        vocab = set()
        for i in range(len(df)):
            for word in df.iloc[i]['words']:
                vocab.add(word)
            for big in df.iloc[i]['bigrams']:
                vocab.add(big)
                
    vocab_dict = dict(zip(vocab, range(len(vocab))))
#     bigram_dict = dict(zip(bigram, range(len(bigram))))

    data_matrix = get_bagofwords(df['words']+df['bigrams'], vocab_dict)
    # import pdb
    # pdb.set_trace()

    return df['id'], df['label'], data_matrix, vocab

In [7]:
def normalize(P):
    """
    normalize P to make sure the sum of every row equals to 1
    e.g.
    Input: [1,2,1,2,4]
    Output: [0.1,0.2,0.1,0.2,0.4] (without laplace smoothing) or [0.1333,0.2,0.1333,0.2,0.3333] (with laplace smoothing)
    """
    # YOUR CODE HERE
    # with out Laplace smoothing
    # norm = np.sum(P, axis = 0, keepdims = True)
    # P_ = P / norm

    # with Laplace smoothing
    K = P.shape[0]
    norm = np.sum(P, axis = 0, keepdims = True)
    P_ = (P + 1.0) / (norm + K)
    
    return P_

In [8]:
def evaluate(y_true, y_pre):
    assert len(y_true) == len(y_pre)
    acc = accuracy_score(y_true, y_pre)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pre, average="macro")
    return acc, precision, recall, f1

In [9]:
#Read Data

train_id_list, train_data_label, train_data_matrix, vocab = read_data("train.csv")
print("Vocabulary Size:", len(vocab))
print("Training Set Size:", len(train_id_list))
test_id_list, _, test_data_matrix, _ = read_data("test.csv", vocab)
print("Test Set Size:", len(test_id_list))

Vocabulary Size: 70839
Training Set Size: 16000
Test Set Size: 4491


In [10]:
#TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
# train_data = transformer.fit_transform(train_data_matrix)
# test_data = transformer.fit_transform(test_data_matrix)
train_data = train_data_matrix
test_data = test_data_matrix


In [11]:
#Split Training Set

from sklearn.model_selection import train_test_split
X_train, X_eva, y_train, y_eva = train_test_split(train_data, train_data_label, test_size=0.33)

from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
parameters = {'learning_rate':(0.1,0.2,0.3), 'feature_fraction':(0.8,0.9,1), 'bagging_fraction':(0.8,0.9,1), 
              'max_depth':(3,4,5,6)}
lgbmodel=lgb.LGBMClassifier(boosting_type='gbdt')
clf = GridSearchCV(lgbmodel,parameters, cv=5, scoring='accuracy')
clf.fit(train_data, train_data_label)

clf.best_params_

from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_eva)

In [12]:
#Train

import lightgbm as lgb
clf=lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.2, feature_fraction=0.9, bagging_fraction=0.8)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_eva)

  if diff:


In [13]:
#Result on Evaluation Set

acc, precision, recall, f1 = evaluate(y_pred, y_eva)
    # import pdb; pdb.set_trace()
print("Evalution: Accuracy: %f\tPrecision: %f\tRecall: %f\tMacro-F1: %f" % (acc, precision, recall, f1))

Evalution: Accuracy: 0.588068	Precision: 0.485395	Recall: 0.534248	Macro-F1: 0.501397


In [14]:
import lightgbm as lgb
clf=lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.2, feature_fraction=0.9, bagging_fraction=0.8)
# clf_final=lgb.LGBMClassifier(boosting_type='gbdt', learning_rate=0.3)
clf.fit(train_data, train_data_label )
pred = clf.predict(test_data)

  if diff:


In [15]:
sub_df = pd.DataFrame()
sub_df["id"] = test_id_list
sub_df['pred'] = pred
len(test_id_list)
sub_df.to_csv("sub_lgb_"+"-"+"bi"".csv", index=False)