# Descriptive statistics

## 1 Setup

Flags

In [1]:
# Figure-saving switch. No cell visible in this excerpt reads it; presumably
# it is meant to gate writing plots to the `FIG` path template — TODO confirm.
SAVE_FIGURES = False

Set up the database

In [2]:
import os
import sys

# Put the directory two levels up on the import path before importing
# `analysis` (presumably that is where the package lives — TODO confirm).
sys.path.insert(1, os.path.abspath('../..'))
import analysis

# Path template for figures: "<current directory>/<name>.png".
FIG = os.path.join(os.path.abspath(os.curdir), '{}.png')

# The database name is derived from the name of the current directory.
DB_NAME = 'spreadr_' + os.path.basename(os.path.abspath(os.curdir))
analysis.setup(DB_NAME)
print('Database:', DB_NAME)

Database: spreadr_pilot_8c


Imports for the analysis

In [3]:
%matplotlib inline
import matplotlib.pyplot as plt
from nltk.metrics import edit_distance
import numpy as np
from pandas import DataFrame
import seaborn as sb
from django.db.models import Count

from gists.models import (Sentence, Profile, GistsConfiguration,
                          JOB_TYPE_CHOICES, EDUCATION_LEVEL_CHOICES)

from analysis.utils import memoized

# Configuration object obtained through `GistsConfiguration.get_solo()`; its
# `experiment_work` and `training_work` attributes are read further down when
# selecting the profiles to analyse.
config = GistsConfiguration.get_solo()

## 2 Despamming

Compute spam rate

In [4]:
# Spam rate = share of all sentences that are NOT in the `nonspam` set.
nonspam_sentences = Sentence.objects.nonspam.count()
all_sentences = Sentence.objects.count()
spam_rate = 1 - (nonspam_sentences / all_sentences)
print('Spam represents {:.1f}% of the sentences'.format(spam_rate * 100))

Spam represents 46.6% of the sentences


In [5]:
# Dropped rate = share of all sentences that are NOT in the `kept` set.
kept_sentences = Sentence.objects.kept.count()
all_sentences = Sentence.objects.count()
dropped_rate = 1 - (kept_sentences / all_sentences)
print('Spam+rogues+doubleposts represents {:.1f}% of the sentences'.format(dropped_rate * 100))

Spam+rogues+doubleposts represents 46.6% of the sentences


This is a lot! The reason is that the `read_factor` was set way too short.

## 3 Interaction of profile variables

Compute the variables

In [None]:
# Select the non-staff profiles whose number of sentences equals
# `config.experiment_work + config.training_work`.
profiles = Profile.objects\
    .annotate(Count('sentences'))\
    .filter(sentences__count=config.experiment_work + config.training_work,
            user__is_staff=False)

# One record (dict) per profile; turned into a DataFrame in a single step below.
data = []
# Lookup tables from stored choice value to its label.
job_type_map = dict(JOB_TYPE_CHOICES)
education_level_map = dict(EDUCATION_LEVEL_CHOICES)

print('Computing variables on {} profiles...'.format(len(profiles)), end='')
for profile in profiles:
    # Progress indicator, printed on the same line as the header above.
    print(' #{}'.format(profile.id), end='')
    questionnaire = profile.questionnaire
    try:
        ed_level_id = int(questionnaire.education_level)
    except ValueError:
        # The education level is not a numeric string: record it as missing.
        # `np.nan` is the canonical spelling; the `np.NaN` alias was removed
        # in NumPy 2.0 and raises AttributeError there.
        ed_level_id = np.nan
    data.append({
        'raw TR': profile.transformation_rate('raw'),
        'ordered content TR': profile.transformation_rate('oc'),
        'age': questionnaire.age,
        'gender': questionnaire.gender,
        'job type': job_type_map[questionnaire.job_type],
        'ed level': education_level_map[questionnaire.education_level],
        'ed level id': ed_level_id,
    })

# Rows are indexed by profile id, in the same order as `data`.
profiles_interactions = DataFrame(data, index=[p.id for p in profiles])

And plot them, first with gender as color (**not much to say about these graphs**):

In [None]:
# Pairwise regression plots of both transformation rates and age,
# with one color per gender.
plotted_vars = ['raw TR', 'ordered content TR', 'age']
sb.pairplot(data=profiles_interactions, vars=plotted_vars,
            hue='gender', kind='reg');

Then with job type as color (**not much to say about this either**):

In [None]:
# Pairwise regression plots of both transformation rates and age,
# with one color per job type.
plotted_vars = ['raw TR', 'ordered content TR', 'age']
sb.pairplot(data=profiles_interactions, vars=plotted_vars,
            hue='job type', kind='reg');

Then with education level as color (**not much to say about this either**):

In [None]:
# Pairwise regression plots of both transformation rates and age,
# with one color per education level.
plotted_vars = ['raw TR', 'ordered content TR', 'age']
sb.pairplot(data=profiles_interactions, vars=plotted_vars,
            hue='ed level', kind='reg');

A boxplot of the interaction between gender, job type, and transformation rate (**again, not much to say about this**):

In [None]:
# Ordered-content transformation rate per job type, split by gender.
ax = sb.boxplot(x='job type', y='ordered content TR', hue='gender',
                palette='Paired', data=profiles_interactions)
# Re-apply the existing tick texts, rotated so that long labels do not overlap.
tick_texts = [label.get_text() for label in ax.get_xticklabels()]
ax.set_xticklabels(tick_texts, rotation=60);

And another boxplot of the interaction between gender, education level, and transformation rate (**again, not much to say about this**):

In [None]:
# Ordered-content transformation rate per education level, split by gender.
ax = sb.boxplot(x='ed level', y='ordered content TR', hue='gender',
                palette='Paired', data=profiles_interactions)
# Re-apply the existing tick texts, rotated so that long labels do not overlap.
tick_texts = [label.get_text() for label in ax.get_xticklabels()]
ax.set_xticklabels(tick_texts, rotation=60);

Interaction of education level and transformation rate?

In [None]:
# Regression of ordered-content transformation rate on the numeric
# education level id.
sb.regplot(x='ed level id',
           y='ordered content TR',
           data=profiles_interactions)

... none.

Age?

In [None]:
# Regression of ordered-content transformation rate on age.
sb.regplot(x='age',
           y='ordered content TR',
           data=profiles_interactions)

Nope. Or at most a very slight effect: participants get slightly better (lower TR) with age!