In [14]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append("..")

# CSV export of the writers dataset, addressed relative to this notebook.
data_path = Path('../data/writers.csv')

# Read the whole file into the DataFrame that the following cells work from.
df = pd.read_csv(filepath_or_buffer=data_path)

  interactivity=interactivity, compiler=compiler, result=result)


### How would we vectorize tabular fields?

Let's pretend we wanted to predict a question's **score** from its tags, number of comments, and creation date.

In [15]:
# Flag the rows whose PostTypeId is 1; the column name marks these as questions.
df['is_question'] = df['PostTypeId'] == 1

# Select the question rows and the four columns of interest in a single .loc
# call, then take an explicit copy. Later cells add columns to tabular_df;
# without the copy, the chained df[mask][cols] selection leaves pandas unsure
# whether those writes target df itself and it emits SettingWithCopyWarning.
tabular_df = df.loc[
    df["is_question"], ["Tags", "CommentCount", "CreationDate", "Score"]
].copy()
tabular_df.head()

Unnamed: 0,Tags,CommentCount,CreationDate,Score
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,32
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,20
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,34
3,<plot><short-story><planning><brainstorming>,0,2010-11-18T20:43:59.693,28
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,21


Normalize numerical input features.

Get dummy variables from categorical features.

Extract structure from date features.

In [16]:

from ml_editor.data_processing import get_normalized_series

# Add a normalized companion column for each raw numeric column.
# get_normalized_series comes from ml_editor.data_processing; it is called
# with the frame and the name of the column to normalize.
for raw_column, normalized_column in (
    ("CommentCount", "NormComment"),
    ("Score", "NormScore"),
):
    tabular_df[normalized_column] = get_normalized_series(tabular_df, raw_column)

In [17]:
# Preview the first five rows, now including the normalized columns.
tabular_df.head(n=5)

Unnamed: 0,Tags,CommentCount,CreationDate,Score,NormComment,NormScore
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,32,0.165706,0.140501
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,20,-0.103524,0.077674
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,34,-0.065063,0.150972
3,<plot><short-story><planning><brainstorming>,0,2010-11-18T20:43:59.693,28,-0.103524,0.119558
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,21,-0.065063,0.082909


In [18]:
# Parse the CreationDate strings into a proper datetime column.
tabular_df['date'] = pd.to_datetime(tabular_df['CreationDate'])

# Break the timestamp out into one column per calendar component,
# in the same order as before: year, month, day, hour.
for date_part in ('year', 'month', 'day', 'hour'):
    tabular_df[date_part] = getattr(tabular_df['date'].dt, date_part)

In [19]:
# Preview the first five rows, now including the extracted date components.
tabular_df.head(n=5)

Unnamed: 0,Tags,CommentCount,CreationDate,Score,NormComment,NormScore,date,year,month,day,hour
0,<resources><first-time-author>,7,2010-11-18T20:40:32.857,32,0.165706,0.140501,2010-11-18 20:40:32.857,2010,11,18,20
1,<fiction><grammatical-person><third-person>,0,2010-11-18T20:42:31.513,20,-0.103524,0.077674,2010-11-18 20:42:31.513,2010,11,18,20
2,<publishing><novel><agent>,1,2010-11-18T20:43:28.903,34,-0.065063,0.150972,2010-11-18 20:43:28.903,2010,11,18,20
3,<plot><short-story><planning><brainstorming>,0,2010-11-18T20:43:59.693,28,-0.103524,0.119558,2010-11-18 20:43:59.693,2010,11,18,20
4,<fiction><genre><categories>,1,2010-11-18T20:45:44.067,21,-0.065063,0.082909,2010-11-18 20:45:44.067,2010,11,18,20


In [20]:
# Select our tags, represented as strings such as "<fiction><genre>", and
# transform them into lists of bare tag names
tags = tabular_df["Tags"]
clean_tags = tags.str.split("><").apply(
    lambda x: [a.strip("<").strip(">") for a in x])

# A tag must appear in more than this many questions to be kept as a feature
MIN_TAG_COUNT = 500

# Use pandas' get_dummies to get one indicator column per tag. Stacking yields
# one row per (question, tag position), so the rows are summed back to one row
# per question by grouping on the outer index level.
# groupby(level=0).sum() replaces the sum(level=0) shorthand, which was
# deprecated in pandas 1.3 and removed in pandas 2.0; sort=False keeps the
# rows in their original order, as the shorthand did.
tag_columns = pd.get_dummies(
    clean_tags.apply(pd.Series).stack()).groupby(level=0, sort=False).sum()

# Number of questions carrying each tag, most frequent first
all_tags = tag_columns.astype(bool).sum(axis=0).sort_values(ascending=False)

# Select only tags that appear over MIN_TAG_COUNT times
top_tags = all_tags[all_tags > MIN_TAG_COUNT]
top_tag_columns = tag_columns[top_tags.index]


In [21]:
# Preview the indicator columns of the most frequent tags for the first five questions.
top_tag_columns.head(n=5)

Unnamed: 0,creative-writing,fiction,style,characters,technique,novel,publishing
0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0
2,0,0,0,0,0,1,1
3,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0


In [22]:
# Join the tag indicator columns onto the tabular frame, aligned on the index
final = pd.concat([tabular_df, top_tag_columns], axis="columns")

# Keep only the vectorized features: the date parts, the normalized numeric
# columns, and one indicator column per frequent tag
date_and_numeric_columns = [
    "year", "month", "day", "hour", "NormComment", "NormScore",
]
col_to_keep = [*date_and_numeric_columns, *top_tags.index]
final_features = final[col_to_keep]

In [23]:
# Display the per-tag question counts, sorted from most to least frequent.
all_tags

creative-writing            1351
fiction                     1253
style                        991
characters                   609
technique                    549
novel                        531
publishing                   529
character-development        419
plot                         405
technical-writing            345
dialogue                     318
fantasy                      306
editing                      289
academic-writing             277
formatting                   273
copyright                    267
structure                    265
self-publishing              235
word-choice                  234
legal                        234
citations                    230
description                  210
short-story                  205
screenwriting                190
software                     188
non-fiction                  174
tools                        170
planning                     159
science-fiction              154
resources                    154
          