# Exploring Gender Differences in Twitter Alcohol Data

In [13]:
import os
import json

%matplotlib inline

import matplotlib.pyplot as plt

import seaborn as sns

sns.set_style("darkgrid")

import pandas as pd
import numpy as np

from itpy.helpers import try_or

In [14]:
sns.set_style("darkgrid")
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})
plt.rcParams["figure.figsize"] = (16,8)
plt.rcParams["figure.dpi"] = 300

In [15]:
%%time
df = pd.DataFrame.from_csv("./labeled.control.dump.csv")
df["time"] = pd.to_datetime(df.created_at)

CPU times: user 2min 26s, sys: 1.17 s, total: 2min 27s
Wall time: 2min 28s


In [16]:
df = df.set_index("time")

## Preprocessing Probability distributions 

In [101]:
p = 0.75

p_firstperson = df.prediction_alcohol_svc * df.prediction_firstperson_svc

In [102]:
of_interest = p_firstperson > p

df["first_person_alcohol"] = 0
df["first_person_alcohol"][of_interest] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## Assigning Genders 

In a seperate program I've mapped first names to genders.

In [19]:
id2gender = pd.DataFrame.from_csv("./id2gender.csv")

gender_map = {
    "Other": "Other",
    "male": "Male",
    "female": "Female",
    "mostly_female": "Female",
    "mostly_male": "Male"
}

id2gender = id2gender.reset_index()
id2gender.index = id2gender.user_id.apply(try_or(lambda _: int(_), lambda _: None))

id2gender.gender = id2gender.gender.apply(gender_map.get)
id2gender = id2gender.gender.to_dict()
df["user_gender"] = df.user_id.apply(id2gender.get)

  data = self._reader.read(nrows)


In [20]:
col = ["prediction_firstperson_level_0", "prediction_firstperson_level_2", "prediction_firstperson_level_3"]
new_fp_cols = ["casual", "looking", "reflecting"]
for new_name, old_name in zip(new_fp_cols, col):
    df[new_name] = df[old_name] * df.prediction_alcohol_svc * df.prediction_firstperson_svc > 0.45

## Groupbys and Plots

In [21]:
# df = df.set_index("time")

In [22]:
def centered_95int(data):
    return 1.96 * (data.std() / np.sqrt(len(data)))

In [10]:
def plot(df, groupby_keys, groupby_label, groupby_name, writetoname, bars=1):

    temp = df.groupby(groupby_keys).agg({
            groupby_label:{
                "proportion": "mean",
                "error": centered_95int
            }
        }).unstack()
    
    temp.to_csv(writetoname)
    
    if bars:

        ax = temp[(groupby_label, "proportion")].plot(
            kind="bar", width=1, yerr=temp[groupby_label].error,
            subplots=(3, 1),
            figsize=(8, 8),
            title=groupby_name
        )
    else:
        ax = temp[(groupby_label, "proportion")].plot(
            kind="area", stacked=1, alpha=0.1,
            subplots=(3, 1),
            figsize=(16, 8),
            title=groupby_name
        )
    
    return temp

In [23]:
simple_agg = {
            "first_person_alcohol":{
                "proportion": "mean",
                "error_95": centered_95int,
                "total_tweets": "count",
            }
        }

In [24]:
pd.DataFrame(
    df.groupby([df.index.hour, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fp_gender.csv")

pd.DataFrame(
    df.groupby([df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fp.csv")

pd.DataFrame(
    df.groupby([df.index.day, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/day_fp_gender.csv")

pd.DataFrame(
    df.groupby([df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fp.csv")

pd.DataFrame(
    df.groupby([df.index.day, df.index.hour, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/dayhour_fp_gender.csv")

pd.DataFrame(
    df.groupby([df.index.day, df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/dayhour_fp.csv")

In [25]:
simple_agg = {
            "casual":{
                "proportion": "mean",
                "error_95": centered_95int,
                "total_tweets": "count",
            },
            "looking":{
                "proportion": "mean",
                "error_95": centered_95int,
                "total_tweets": "count",
            },
            "reflecting":{
                "proportion": "mean",
                "error_95": centered_95int,
                "total_tweets": "count",
            }
        }

In [82]:
pd.DataFrame(
    df.groupby([df.index.hour, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fpl_gender.csv")

pd.DataFrame(
    df.groupby([df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fpl.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/day_fpl_gender.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fpl.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek, df.index.hour, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/dayhour_fpl_gender.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek, df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/dayhour_fpl.csv")

In [114]:
df.groupby([df.index.dayofweek]).agg({
        "first_person_alcohol":{"first_person_alcohol":"sum", "everything": np.size}
    }).first_person_alcohol.to_csv("/Users/JasonLiu/Desktop/counts_day_everything_V_fp")

In [115]:
df.groupby([df.index.hour]).agg({
        "first_person_alcohol":{"first_person_alcohol":"sum", "everything": np.size}
    }).first_person_alcohol.to_csv("/Users/JasonLiu/Desktop/counts_hour_everything_V_fp")

In [116]:
df.groupby([df.index.dayofweek, df.index.hour]).agg({
        "first_person_alcohol":{"first_person_alcohol":"sum", "everything": np.size}
    }).first_person_alcohol.to_csv("/Users/JasonLiu/Desktop/counts_dayhour_everything_V_fp")

In [None]:
pd.DataFrame(
    df.groupby([df.index.hour]).agg(
        simple_agg
    ).unstack()
)#.to_csv("/Users/JasonLiu/Desktop/hour_fpl_gender.csv")

pd.DataFrame(
    df.groupby([df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fpl.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/day_fpl_gender.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/hour_fpl.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek, df.index.hour, df.user_gender]).agg(
        simple_agg
    ).unstack()
).to_csv("/Users/JasonLiu/Desktop/dayhour_fpl_gender.csv")

pd.DataFrame(
    df.groupby([df.index.dayofweek, df.index.hour]).agg(
        simple_agg
    )# .unstack()
).to_csv("/Users/JasonLiu/Desktop/dayhour_fpl.csv")