In [0]:
import xgboost as xgb
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer
import pandas as pd
import re
from sklearn.feature_extraction import text
from nltk.stem import WordNetLemmatizer
import json
from gensim.models import Word2Vec
import nltk
nltk.download('wordnet')

# scikit-learn's English stop-word list, copied into a plain set; clean_review
# drops any lemmatised token found in it.
stop_words = set(text.ENGLISH_STOP_WORDS)
# Runs of these punctuation characters are replaced by a single space in
# clean_review. The class omits the apostrophe, hyphen, backtick and square
# brackets, so those characters stay inside tokens.
rex = re.compile(r'[!"#$%&\()*+,./:;<=>?@\\^_{|}~]+')
# Shared lemmatizer instance used by clean_review.
lemmatizer = WordNetLemmatizer()

def clean_review(raw_review: str) -> list:
    """Tokenise, lemmatise and stop-word-filter a raw piece of text.

    Runs of the punctuation characters matched by ``rex`` are replaced with a
    space, the text is lower-cased and split on whitespace. Each token is
    lemmatised twice: first with the lemmatizer's default part of speech,
    then as a verb (``"v"``). Tokens present in ``stop_words`` are dropped.

    Args:
        raw_review: The raw text to clean.

    Returns:
        The surviving lemmatised tokens as a list of strings, in their
        original order. The result is a list, not a joined string.
    """
    normalized = rex.sub(' ', raw_review).lower()
    meaningful_words = []
    for word in normalized.split():
        # Default-POS pass first, then the verb pass on its result.
        lemma = lemmatizer.lemmatize(lemmatizer.lemmatize(word), "v")
        # Stop words are filtered after lemmatisation, so inflected forms
        # that reduce to a stop word are removed as well.
        if lemma not in stop_words:
            meaningful_words.append(lemma)
    return meaningful_words

def list2Str(stringList, length):
    """Turn each token sequence into a single space-prefixed string.

    Only the first ``length`` tokens of every item are used. Each token is
    preceded by one space, so a non-empty result starts with a space and an
    empty item yields ``''``.

    Args:
        stringList: Iterable of token sequences.
        length: Maximum number of tokens taken from each sequence.

    Returns:
        A list with one string per input item, in input order.
    """
    return [
        ''.join(' ' + token for token in tokens[:length])
        for tokens in stringList
    ]

def _load_clean_texts(file, length=512):
    """Load a JSON file and return its cleaned texts as a NumPy string array.

    Assumes the top-level JSON value is an object whose values each carry a
    ``"text"`` field; every such text is run through ``clean_review`` and
    re-joined (first ``length`` tokens only) by ``list2Str``.
    """
    # JSON interchange files are UTF-8; do not depend on the platform's
    # locale-specific default encoding.
    with open(file, encoding="utf-8") as json_file:
        data = json.load(json_file)
    token_lists = [clean_review(data[key]["text"]) for key in data]
    return np.array(list2Str(token_lists, length=length))


def getTrainData(file, label):
    """Load labelled training texts from a JSON file.

    Args:
        file: Path of the JSON file to read.
        label: The label assigned to every instance in the file.

    Returns:
        A tuple ``(instance_text, instance_label)`` of NumPy arrays with one
        entry per record: the cleaned text and a copy of ``label``.
    """
    instance_text = _load_clean_texts(file)
    # Built from a list (as before) so an empty file still yields an empty array.
    instance_label = np.array([label] * len(instance_text))
    return instance_text, instance_label


def getTestData(file):
    """Load unlabelled texts from a JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        A NumPy array holding the cleaned text of every record.
    """
    return _load_clean_texts(file)

from sklearn.feature_extraction.text import TfidfVectorizer

# Load the three labelled corpora (label 1 for train.json, label 0 for the
# two negative files) and the unlabelled set to predict on.
pos_text, pos_label = getTrainData("train.json", 1)
cli_text, cli_label = getTrainData("climate.json", 0)
non_cli_text, non_cli_label = getTrainData("no_climate.json", 0)
dev_text = getTestData("test-unlabelled.json")
print(len(pos_label), len(cli_label), len(non_cli_label), len(dev_text))

# Stack every text (labelled and unlabelled) so that a single vocabulary is
# shared by all slices taken below.
# NOTE(review): the vectorizer is therefore also fitted on the unlabelled
# texts - confirm this is intended.
X = np.concatenate([pos_text, cli_text, non_cli_text, dev_text], axis=0)
Y = np.concatenate([pos_label, cli_label, non_cli_label])
dev_len = len(dev_text)

cv = TfidfVectorizer(binary=False, decode_error='ignore', stop_words='english')
vec = cv.fit_transform(X)
# Densified because the later cells combine slices with NumPy array helpers.
new_X = vec.toarray()

# Cumulative row offsets of each corpus inside new_X.
pos_end = len(pos_label)
cli_end = pos_end + len(cli_label)
non_cli_end = cli_end + len(non_cli_label)

pos_X = new_X[:pos_end]
cli_X = new_X[pos_end:cli_end]
non_cli_X = new_X[cli_end:non_cli_end]
# Slice from the explicit offset: new_X[-dev_len:] would return the whole
# matrix when dev_len == 0.
dev_X = new_X[non_cli_end:]
print(new_X.shape)
print(pos_X.shape)
print(cli_X.shape)
print(non_cli_X.shape)
print(dev_X.shape)

# Training set for the first model: positives vs. the no_climate.json rows.
X_train = np.concatenate([pos_X, non_cli_X], axis=0)
Y_train = np.concatenate([pos_label, non_cli_label])
print(X_train.shape)
print(Y_train.shape)

In [0]:
# First model: trained on the X_train / Y_train currently in scope.
dtrain = xgb.DMatrix(X_train, label=Y_train)
ddev = xgb.DMatrix(dev_X)
# NOTE(review): 'silent' was superseded by 'verbosity' in newer xgboost
# releases - confirm against the installed version.
param = {
    'max_depth': 6,
    'eta': 0.05,
    'eval_metric': 'merror',
    'silent': 0,
    'objective': 'multi:softmax',
    'num_class': 2,
}
num_round = 200
# Only the training set is watched, so the logged error is training error.
evallist = [(dtrain, 'train')]
bst = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

In [0]:
# Predict on the unlabelled matrix with the first model (bst). A later cell
# forces the final label to 0 wherever this prediction is 0.
preds_cli = bst.predict(ddev)
print(preds_cli)

In [0]:
# Second model: positives (pos_X) vs. the climate.json rows (cli_X).
# This rebinds X_train, Y_train and dtrain from the first model's cell.
Y_train = np.concatenate([pos_label, cli_label])
X_train = np.concatenate([pos_X, cli_X], axis=0)
dtrain = xgb.DMatrix(X_train, label=Y_train)
# NOTE(review): 'silent' was superseded by 'verbosity' in newer xgboost
# releases - confirm against the installed version.
param = {
    'max_depth': 6,
    'eta': 0.05,
    'eval_metric': 'merror',
    'silent': 0,
    'objective': 'multi:softmax',
    'num_class': 2,
}
num_round = 200
# Only the training set is watched, so the logged error is training error.
evallist = [(dtrain, 'train')]
bst1 = xgb.train(param, dtrain, num_boost_round=num_round, evals=evallist)

In [0]:
# Predict on the same unlabelled matrix with the second model (bst1). These
# predictions become the final labels after the first model's veto is applied.
preds_mis = bst1.predict(ddev)
print(preds_mis)

In [0]:
# Cascade the two models: wherever the first model predicted 0, override the
# second model's prediction with 0 (in place, on preds_mis).
preds_mis[preds_cli == 0] = 0
# Persist the combined predictions; the DataFrame index is written as the
# first CSV column and the labels as the second.
final_data = pd.DataFrame(preds_mis)
final_data.to_csv("predict_data.csv")

In [0]:
# Reload the predictions saved to predict_data.csv. The first CSV column is
# the saved DataFrame index, so the labels sit at column position 1.
raw_positive = pd.read_csv("predict_data.csv")
positive_len = len(raw_positive)
positive_data = np.array(raw_positive.iloc[:, 1])
print(positive_data)

submit = './test-output.json'
# Number of instances predicted as label 1.
print(len(np.argwhere(positive_data == 1)))

# Map "test-<row number>" -> {"label": <int>}. The mapping is named
# ``submission`` rather than ``dict`` so the built-in type is not shadowed.
# The per-item debug print was dropped; the full mapping is printed below.
submission = {
    "test-" + str(i): {'label': int(item)}
    for i, item in enumerate(positive_data)
}
print(submission)
with open(submit, 'w') as f:
    json.dump(submission, f)