# 🧔 Om vs AI (Human vs AI) 🤖

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Load data

In [2]:
# Training set: the "ID" column is dropped right after loading.
df_train = pd.read_csv("train_data.csv").drop(columns="ID")

# The test file holds the rows of both subtasks, told apart by "subtaskID".
# Each subtask gets its own frame with that column removed.
df_test = pd.read_csv("test_data.csv")
df_test1 = df_test[df_test["subtaskID"] == 1].drop(columns="subtaskID")
df_test2 = df_test[df_test["subtaskID"] == 2].drop(columns="subtaskID")

df_train

Unnamed: 0,text,label
0,"In ""Tie Challenge of Exploring Venus,"" tie aut...",0.0
1,These sources focus on the advantages of limit...,0.0
2,This new technology can really figure out if y...,0.0
3,"In today's society, technology is used everyda...",0.0
4,Limiting car usage is a topic that has been g...,1.0
...,...,...
9995,"In my opinion, the Electoral College is not an...",0.0
9996,What if you could go to school at home OE you'...,0.0
9997,I am all for this new technology to read peopl...,0.0
9998,"Call, though useful, have negative impacts on ...",0.0


## Subtask 1

In [3]:
# Features are the raw texts; the target is the "label" column.
X_train = df_train["text"]
y_train = df_train["label"]

# Texts to classify for subtask 1.
X_test1 = df_test1["text"]

In [4]:
# TF-IDF text features fed into a support-vector classifier.
# The classifier step is named "svc" so that the step label matches the
# estimator it actually holds (it is an SVC, not a logistic regression).
pipeline = Pipeline([
    ("tf", TfidfVectorizer()),
    ("svc", SVC(random_state=42)),
])

In [5]:
# 5-fold cross-validated F1 score of the whole pipeline on the training data.
cross_val_score(estimator=pipeline, X=X_train, y=y_train, scoring="f1", cv=5)

array([0.93573265, 0.93975904, 0.92881944, 0.92493529, 0.92728828])

In [6]:
# Fit on the full training set, then label the subtask-1 test texts.
# `fit` returns the fitted pipeline itself, so the calls can be chained.
preds1 = pipeline.fit(X_train, y_train).predict(X_test1)

In [7]:
# One (subtaskID, datapointID, answer) tuple per subtask-1 test row.
subtask1_rows = [(1, id_, val) for id_, val in zip(df_test1["ID"], preds1)]

## Subtask 2

In [8]:
# Texts to cluster for subtask 2 (there are no labels for this part).
X_test2 = df_test2.loc[:, "text"]
X_test2

4763    Richard Smith, the CEO of credit monitoring co...
4764    As average temperatures rise across the planet...
4765    In a complaint filed Wednesday, the Securities...
4766    Imagine a world in which women without viable ...
4767    Two Arizona women, Tahnee Gonzales and Elizabe...
                              ...                        
5558    A woman in New York was charged with attempted...
5559    When I lived in Appalachia, an elementary scho...
5560    STOCKHOLM/LONDON (Reuters) - A trio of Swiss, ...
5561    by Geoff West Washington lobbying groups spent...
5562    Millennials have been getting hammered by the ...
Name: text, Length: 800, dtype: object

In [9]:
# NLTK's English stop-word list, stored as a set for fast membership tests.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)

In [10]:
def preprocess(entries):
    """Tokenize every document and remove stop words.

    Each entry is tokenized with gensim's ``simple_preprocess`` and tokens
    found in the module-level ``stop_words`` set are discarded.

    Parameters
    ----------
    entries : iterable of str
        Raw documents.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    # simple_preprocess lowercases its input itself, so an explicit
    # .lower() call on the entry would be redundant work.
    return [
        [token for token in simple_preprocess(entry) if token not in stop_words]
        for entry in entries
    ]

In [11]:
# Stop-word-free token lists, one per subtask-2 document.
X_test2_tok = preprocess(entries=X_test2)

In [12]:
# Tag every document with its position in X_test2_tok so that its learned
# vector can later be looked up by that integer.
documents = [
    TaggedDocument(words=doc_tokens, tags=[doc_idx])
    for doc_idx, doc_tokens in enumerate(X_test2_tok)
]

# Single worker plus a fixed seed, aiming at repeatable training.
# NOTE(review): gensim's documentation says fully deterministic runs may also
# require PYTHONHASHSEED to be set -- confirm for the environment in use.
d2v = Doc2Vec(
    vector_size=60,
    window=5,
    min_count=1,
    workers=1,
    epochs=80,
    seed=42,
)
d2v.build_vocab(documents)
d2v.train(documents, total_examples=d2v.corpus_count, epochs=d2v.epochs)

In [13]:
# Collect the learned document vectors in document order, one row each.
doc_vectors = [d2v.dv[doc_idx] for doc_idx in range(len(X_test2_tok))]
X_test2_vec = np.vstack(doc_vectors)

# Scale every row to unit L2 length.
row_norms = np.linalg.norm(X_test2_vec, axis=1, keepdims=True)
X_test2_vec = X_test2_vec / row_norms
X_test2_vec.shape

(800, 60)

In [14]:
# Partition the normalized document vectors into four clusters.
# NOTE(review): the cluster ids produced here are what the hand-written
# cluster-to-topic mapping further down relies on.
km = KMeans(n_clusters=4, random_state=42)
preds = km.fit_predict(X_test2_vec)

# Cluster ids together with the number of documents assigned to each.
np.unique(preds, return_counts=True)

(array([0, 1, 2, 3]), array([206, 203, 198, 193], dtype=int64))

In [15]:
# Pair each subtask-2 text with its cluster id, then show cluster 0.
df_res = pd.DataFrame({"text": X_test2, "cluster": preds})
in_cluster0 = df_res["cluster"] == 0
df_res.loc[in_cluster0]

Unnamed: 0,text,cluster
4763,"Richard Smith, the CEO of credit monitoring co...",0
4765,"In a complaint filed Wednesday, the Securities...",0
4772,"Since fall always feels like a reset, thereâ€™s ...",0
4778,"Six months after acquiring Whole Foods, Amazon...",0
4784,"Hackers breached MyFitnessPal, a popular calor...",0
...,...,...
5545,"Uber, which just this week reportedly pickedÂ a...",0
5550,Why your diversity and unconscious bias traini...,0
5551,"Since its launch in 2013, neo-Nazi website The...",0
5561,by Geoff West Washington lobbying groups spent...,0


In [16]:
# Documents assigned to cluster 1.
df_res.loc[df_res["cluster"] == 1]

Unnamed: 0,text,cluster
4769,What do you imagine God thinks of all the call...,1
4770,The thousands of clergy members who comprise T...,1
4771,"JERUSALEM, June 25 (Reuters) - Israelâ€™s govern...",1
4776,"In these times, the Rev. Jennifer Butler, CEO ...",1
4777,More than 100 interfaith demonstrators gathere...,1
...,...,...
5541,"More Americans are embracing the holly, jolly,...",1
5547,Twitter is dragging Joel Osteen. And he deserv...,1
5553,"76 years ago this week, Karolina Cohn was torn...",1
5554,Indians have started observing the festival of...,1


In [17]:
# Documents assigned to cluster 2.
df_res.loc[df_res["cluster"] == 2]

Unnamed: 0,text,cluster
4767,"Two Arizona women, Tahnee Gonzales and Elizabe...",2
4768,Billionaire Dallas Mavericks owner Mark Cuban ...,2
4774,A far-right media personality has been barred ...,2
4775,An 18-year-old California man confessed this w...,2
4788,A Michigan man was arrested by federal authori...,2
...,...,...
5548,A teen suspect is in custody after police resp...,2
5549,A former pro wrestler was brutally beaten and ...,2
5555,Police arrested a 19-year-old man just after m...,2
5557,The man suspected of murdering Blaze Bernstein...,2


In [18]:
# Documents assigned to cluster 3.
df_res.loc[df_res["cluster"] == 3]

Unnamed: 0,text,cluster
4764,As average temperatures rise across the planet...,3
4766,Imagine a world in which women without viable ...,3
4773,Climate change has already touched almost all ...,3
4779,"In July, Harvard scientists used a gene-editin...",3
4780,Scientists in Australia are aiming a huge dish...,3
...,...,...
5544,Nine former lab chimpanzees are settling into ...,3
5546,"Since 1995, Stanford mathematician Keith Devli...",3
5552,Former astronaut Buzz Aldrin was airlifted fro...,3
5556,British theoretical physicist and cosmologist ...,3


In [19]:
# Cluster id -> topic name. Identify these manually from the cluster samples.
num2topic = {
    0: "BUSINESS",
    1: "RELIGION",
    2: "CRIME",
    3: "SCIENCE",
}
preds2 = df_res["cluster"].map(num2topic)

In [20]:
# One (subtaskID, datapointID, answer) tuple per subtask-2 test row.
subtask2_rows = [(2, id_, val) for id_, val in zip(df_test2["ID"], preds2)]

## Save answers

In [21]:
# Subtask-1 rows first, then subtask-2 rows, written out as one CSV file.
submission_rows = [*subtask1_rows, *subtask2_rows]
df_submission = pd.DataFrame(
    submission_rows,
    columns=["subtaskID", "datapointID", "answer"],
)
df_submission.to_csv("submission.csv", index=False)

## Submission results

Subtask 1:
- F1: 0.956915
- Score: 60/60

Subtask 2:
- Accuracy: 0.9725
- Score: 40/40
