In [1]:
%pylab inline
import pandas as pd

Populating the interactive namespace from numpy and matplotlib


# Load the train data

In [2]:
# Load the training table (columns seen in the preview: author, subreddit, created_utc, body).
train_data = pd.read_csv("../input/datamining2020/train_data.csv", encoding="utf8")

In [3]:
train_data.head()

Unnamed: 0,author,subreddit,created_utc,body
0,Shamus_Aran,mylittlepony,1388534000.0,I don't think we'd get nearly as much fanficti...
1,Riddance,sex,1388534000.0,"Thanks. I made it up, that's how I got over my..."
2,Secret_Wizard,DragonsDogma,1388534000.0,Are you sure you aren't confusing Cyclops (the...
3,Penultimatum,malefashionadvice,1388534000.0,dont do this to me bro
4,7-SE7EN-7,todayilearned,1388534000.0,That's what we do when we can't find a mate


In [4]:
# Number of distinct authors in the training table, shown as a 1-tuple shape.
train_data.author.unique().shape

(5000,)

In [5]:
# Load the training labels (columns seen in the preview: author, gender).
target = pd.read_csv("../input/datamining2020/train_target.csv")

In [6]:
target.head()

Unnamed: 0,author,gender
0,RedThunder90,0
1,Lirkmor,1
2,In0chi,0
3,ProjectGrudge,0
4,TehTurtleHermit,0


# Feature Extraction

In [7]:
# Give every distinct training subreddit a fixed column position: the Series
# is indexed by subreddit name and holds that subreddit's column number.
subreddits = train_data.subreddit.unique()
subreddits_map = pd.Series(np.arange(len(subreddits)), index=subreddits)

In [8]:
from scipy import sparse

In [9]:
def extract_features(group, subreddit_index=None):
    """Encode one author's subreddits as a 1 x n_subreddits sparse indicator row.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one author; must have a ``subreddit`` column.
    subreddit_index : pandas.Series, optional
        Maps subreddit name -> column position. Defaults to the notebook-level
        ``subreddits_map``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(1, len(subreddit_index))``, float64, with 1.0 in the column of
        every mapped subreddit that occurs in ``group``. Subreddits that are
        absent from the mapping (e.g. ones only present in the test data) are
        ignored.
    """
    if subreddit_index is None:
        subreddit_index = subreddits_map

    # Use reindex instead of ``series[labels]``: reindex yields NaN for
    # unknown labels, whereas list-indexing with a missing label raises
    # KeyError on current pandas versions.
    seen = group['subreddit'].unique()
    cols = subreddit_index.reindex(seen).dropna().astype(np.int64).values

    # Build the row directly in CSR form; the column indices are distinct
    # because ``seen`` holds each subreddit once.
    data = np.ones(cols.shape[0], dtype=np.float64)
    rows = np.zeros(cols.shape[0], dtype=np.int64)
    return sparse.csr_matrix((data, (rows, cols)), shape=(1, len(subreddit_index)))

extract_features(train_data[train_data.author=='RedThunder90'])

<1x3468 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [10]:
# Build one sparse subreddit-indicator row per training author, keyed by author name.
features_dict = {
    name: extract_features(comments)
    for name, comments in train_data.groupby('author')
}

In [11]:
# Stack the per-author rows in the order authors are listed in `target`,
# so the i-th row of X corresponds to the i-th author in target.
rows_in_target_order = [features_dict[name] for name in target.author]
X = sparse.vstack(rows_in_target_order)
X

<5000x3468 sparse matrix of type '<class 'numpy.float64'>'
	with 49152 stored elements in Compressed Sparse Row format>

In [12]:
# Label vector: the gender column of `target`, in the same author order used to build X.
y = target.gender

In [13]:
def extract_text(group):
    """Join all of one author's comment bodies into a single string.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one author; must have a ``body`` column.

    Returns
    -------
    str
        The bodies in row order, separated by single spaces. Missing bodies
        (NaN/None) are skipped; an empty group yields ``""``.
    """
    # An empty CSV field is read back as NaN (a float), which would make
    # str.join raise TypeError, so drop missing values and coerce to str.
    bodies = group['body'].dropna().astype(str)
    return " ".join(bodies)

extract_text(train_data[train_data.author=='RedThunder90'])

'I still prefer to buy foods either grown locally or where animals are treated better, but this definitely has me looking at organic food differently.'

In [14]:
# Collect each training author's concatenated comment text, keyed by author name.
text_dict = {
    name: extract_text(comments)
    for name, comments in train_data.groupby('author')
}

In [15]:
# Order the per-author texts the same way as `target`, then preview the
# first 100 characters of the first author's text.
author_text = [text_dict[name] for name in target.author]
author_text[0][:100]

'I still prefer to buy foods either grown locally or where animals are treated better, but this defin'

# Model Selection

In [16]:
# YOUR CODE HERE

class Model:
    """Placeholder classifier, meant to be replaced with a real model."""

    def predict_proba(self, X):
        """Return an all-zero array with one row per row of X and two columns."""
        n_samples = X.shape[0]
        return np.zeros((n_samples, 2))


model = Model()

# Prepare the solution

In [17]:
# Load the test table; it is featurized below with the same extract_features used for training.
test_data = pd.read_csv("../input/datamining2020/test_data.csv", encoding="utf8")

In [18]:
# Build one sparse subreddit-indicator row per test author, keyed by author name.
# NOTE(review): this rebinds `features_dict`, discarding the training-author
# rows built earlier; re-running the cell that builds `X` after this one would
# look training authors up in the test dictionary.
features_dict = {
    name: extract_features(comments)
    for name, comments in test_data.groupby('author')
}

In [19]:
# Stack one row per test author, following the order of test_data.author.unique().
test_rows = [features_dict[name] for name in test_data.author.unique()]
X_test = sparse.vstack(test_rows)
X_test

<15000x3468 sparse matrix of type '<class 'numpy.float64'>'
	with 144898 stored elements in Compressed Sparse Row format>

In [20]:
# Keep only column 1 of the predicted probabilities
# (presumably the probability of gender == 1 — confirm against the fitted model's class order).
y_pred = model.predict_proba(X_test)[:,1]

In [21]:
# Pair each prediction with its author. The author order here is
# test_data.author.unique(), the same order used to stack the rows of X_test.
solution = pd.DataFrame({"author":test_data.author.unique(), "gender":y_pred})
solution.head()

Unnamed: 0,author,gender
0,ejchristian86,0.0
1,ZenDragon,0.0
2,savoytruffle,0.0
3,hentercenter,0.0
4,rick-o-suave,0.0


In [22]:
# solution.to_csv("solution.csv", index=False)