In [None]:
import pandas as pd
import sqlite3

In [None]:
# Open the SQLite database file that the query in the next cell reads from.
sql = sqlite3.connect(database="technology.db")

In [None]:
# Pull the columns used below from the posts table into a DataFrame;
# created_utc is parsed into datetimes so it can be filtered by year later.
posts = pd.read_sql(
    sql="SELECT id, url, created_utc, title, flair, parent_id, author, score, selftext, body FROM posts p",
    con=sql,
    parse_dates=["created_utc"],
)

In [None]:
# Keep rows created in 2015 or later that have no parent_id and carry a flair.
# The copy makes top2015 independent of posts so columns can be added safely.
top2015 = posts.loc[
    posts["created_utc"].dt.year.ge(2015)
    & posts["parent_id"].isna()
    & posts["flair"].notna()
].copy()

In [None]:
# Boolean label: True when the flair is one of the two transport spellings.
top2015 = top2015.assign(
    target=top2015["flair"].isin(["Transport", "Transportation"])
)

In [None]:
# Show how many rows fall into each class of the target label.
top2015.value_counts(subset="target")

In [None]:
# Build a balanced dataset: all positive rows plus an equally sized random
# sample of negative rows (fixed seed so the sample is repeatable).
pos = top2015.loc[top2015["target"]]
neg = top2015.loc[~top2015["target"]]
data = pd.concat(
    [pos, neg.sample(n=pos.shape[0], random_state=42)],
    ignore_index=True,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF over the titles. Terms appearing in more than 70% of the
# titles, or in fewer than five of them, are left out of the vocabulary.
tfidf = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=5,
    max_df=0.7,
)
tfidf_vectors = tfidf.fit_transform(data["title"])

In [None]:
# Feature matrix (TF-IDF vectors) and the matching label array.
X, y = tfidf_vectors, data["target"].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier with hinge loss, trained on the training split only.
# The seed makes the fitted model repeatable.
clf = SGDClassifier(
    loss="hinge",
    max_iter=1000,
    tol=1e-3,
    random_state=42,
)
clf.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split so they can be scored below.
y_predicted = clf.predict(X=X_test)

In [None]:
from sklearn import metrics

# Per-class precision, recall and F1 on the held-out split.
print(metrics.classification_report(y_true=y_test, y_pred=y_predicted))

In [None]:
# Combine title, body and selftext into a single text column.
# Missing values are replaced with empty strings first: converting them with
# str() would inject the literal words "None"/"nan" into every row's text,
# which would then reach both the classifier input and the exported CSV.
# The final strip removes the padding left behind by empty parts.
posts["text"] = (
    posts["title"].fillna("").astype(str)
    + " "
    + posts["body"].fillna("").astype(str)
    + " "
    + posts["selftext"].fillna("").astype(str)
).str.strip()

In [None]:
# Vectorise the combined text with the already-fitted TF-IDF vocabulary and
# store the classifier's prediction for every row.
posts["transport"] = clf.predict(
    tfidf.transform(posts["text"].astype(str))
)

In [None]:
# Independent copy of the rows the classifier flagged, displayed for inspection.
transport = posts.loc[posts["transport"].eq(True)].copy()
transport

In [None]:
# Replace every newline and carriage-return character with a space so each
# record's text stays on one line.
transport["text"] = transport["text"].str.replace(r"[\r\n]", " ", regex=True)

In [None]:
# Write the flagged rows to CSV, indexed by id, with the selected columns.
(
    transport
    .set_index("id")
    .loc[:, ["created_utc", "url", "parent_id", "author", "score", "text"]]
    .to_csv("transport-all-comments.csv", index_label="id")
)