# Mentions

## Data prep

### Load the data and count.

In [8]:
import pandas as pd
import numpy as np
import logging
from dateutil.parser import parse as date_parse
from utils import load_tweet_df, tweet_type

# Configure the root logger at DEBUG level so that the INFO/DEBUG progress
# messages emitted while the tweets are loaded show up in the cell output.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Simplify the tweet on load
def mention_transform(tweet):
    """Flatten the user mentions of an original tweet into a list of row dicts.

    Tweets carrying a 'retweeted_status' or 'quoted_status' key are skipped and
    produce an empty list. For any other tweet, one dict is produced per entry
    in tweet['entities']['user_mentions'], pairing the tweet's author with the
    mentioned account. A tweet without entities or mentions yields [].
    """
    # Only original tweets contribute mentions; retweets and quotes are ignored.
    if 'retweeted_status' in tweet or 'quoted_status' in tweet:
        return []

    mentioned_accounts = tweet.get('entities', {}).get('user_mentions', [])
    return [
        {
            'tweet_id': tweet['id_str'],
            'user_id': tweet['user']['id_str'],
            'screen_name': tweet['user']['screen_name'],
            'mention_user_id': account['id_str'],
            'mention_screen_name': account['screen_name'],
            'tweet_created_at': date_parse(tweet['created_at'])
        }
        for account in mentioned_accounts
    ]

# Column names mirror the keys of the dicts produced by mention_transform.
# Presumably load_tweet_df applies the transform to every loaded tweet and
# uses this list as the frame's columns -- confirm against utils.
mention_columns = [
    'tweet_id',
    'user_id',
    'screen_name',
    'mention_user_id',
    'mention_screen_name',
    'tweet_created_at',
]
mention_df = load_tweet_df(mention_transform, mention_columns)

INFO:root:['Token', 'Uid', 'Link']
INFO:root:['Token', 'Uid', 'Link']
INFO:root:['Token', 'Uid', 'Link']
INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_001.json.gz
DEBUG:root:Loaded 50000
DEBUG:root:Loaded 100000
DEBUG:root:Loaded 150000
DEBUG:root:Loaded 200000
DEBUG:root:Loaded 250000
INFO:root:Loading from tweets/6b6a0be4f70640648b56447b387f17a2_002.json.gz


### Number of mentions found in the dataset

In [9]:
# count() reports the non-null entries of mention_user_id, i.e. the number of
# mention rows that were loaded.
mention_df[['mention_user_id']].count()

mention_user_id    57589
dtype: int64

### The mention data
Each mention consists of the tweet id and creation time, the screen name and user id of the account that is mentioned,
and the screen_name and user_id of the account that is doing the mentioning.

In [None]:
# Preview the first rows of the mention table.
mention_df.head()

### Create lookup of mentioned user ids to screen names

In [None]:
# From the mentions, extract map of user ids to screen names.
# For every mentioned user id, idxmax() returns the row label of the mention
# with the latest tweet_created_at, so the lookup carries the most recent
# screen name seen for that id.
latest_mention_rows = mention_df.groupby('mention_user_id')['tweet_created_at'].idxmax()
# Rows and columns are selected in a single .loc call. The original used the
# .ix indexer, which was deprecated in pandas 0.20 and removed in 1.0.
mention_user_id_lookup_df = (
    mention_df.loc[latest_mention_rows, ['mention_user_id', 'mention_screen_name']]
    .set_index(['mention_user_id'])
)
mention_user_id_lookup_df.count()

In [None]:
# Preview the mentioned-user-id -> screen-name lookup.
mention_user_id_lookup_df.head()

### Create lookup of user ids to screen names

In [None]:
# From the users (not the mentions), extract map of user ids to screen names.
# For every mentioning user, idxmax() returns the row label of their latest
# tweet, so the lookup carries the most recent screen name seen for that user.
latest_user_rows = mention_df.groupby('user_id')['tweet_created_at'].idxmax()
# Rows and columns are selected in a single .loc call. The original used the
# .ix indexer, which was deprecated in pandas 0.20 and removed in 1.0.
user_id_lookup_df = (
    mention_df.loc[latest_user_rows, ['user_id', 'screen_name']]
    .set_index(['user_id'])
)
user_id_lookup_df.count()

### Group mentions by mentioned user id

In [None]:
# Count the mention rows for each mentioned user id.
# The number of rows here should match the mentioned-user-id lookup count,
# because both have one entry per distinct mention_user_id.
mentions_per_mentioned_user = mention_df.groupby('mention_user_id').size()
mention_summary_user_id_df = pd.DataFrame(
    mentions_per_mentioned_user,
    columns=['mention_count'],
)
mention_summary_user_id_df.count()

In [None]:
# Preview the per-account mention counts.
mention_summary_user_id_df.head()

### Add back in the mention screen names

In [None]:
# Attach the latest screen name to each mentioned user id. Both frames are
# indexed by mention_user_id, so this is an index-on-index left join.
mention_summary_screen_name_df = mention_summary_user_id_df.join(
    mention_user_id_lookup_df,
    how='left',
)
mention_summary_screen_name_df.count()

In [None]:
# Preview the mention counts together with the screen names.
mention_summary_screen_name_df.head()

### Add user types for mentions

In [None]:
# Load lookups of known users
from utils import load_user_type_lookup_df

# Keep only the 'type' column of the lookup.
# NOTE(review): presumably the lookup is indexed by user id, since it is later
# joined against user-id-indexed frames -- confirm in utils.
user_type_lookup_df = load_user_type_lookup_df()[['type']]
user_type_lookup_df.count()

In [None]:
# Preview the known-user type lookup.
user_type_lookup_df.head()

In [None]:
# Number of known users per account type.
user_type_lookup_df['type'].value_counts()

In [None]:
# Join the mentions and the known users.
# The left join keeps every mentioned account; accounts that are absent from
# the known-user lookup come back with a missing type.
mention_summary_type_df = mention_summary_screen_name_df.join(user_type_lookup_df, how='left')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form raises a FutureWarning in
# pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
mention_summary_type_df['type'] = mention_summary_type_df['type'].fillna('unknown')
# The index holds the mentioned user's id; name it 'user_id' for later joins.
mention_summary_type_df.index.name = 'user_id'
mention_summary_type_df.count()

In [None]:
# Preview the mention summary with account types attached.
mention_summary_type_df.head()

### Add number of users mentioning
Which is different than the number of mentions.

In [None]:
# One row per distinct (mentioned account, mentioning user) pair, so repeated
# mentions of the same account by the same user collapse into one row.
mention_user_id_per_user_df = mention_df[['mention_user_id', 'user_id']].drop_duplicates()
# Number of distinct mentioning users for each mentioned account.
mentioning_users_per_account = mention_user_id_per_user_df.groupby('mention_user_id').size()
mention_user_id_per_user_summary_df = pd.DataFrame(
    mentioning_users_per_account,
    columns=['users_mentioning_count'],
)
mention_user_id_per_user_summary_df.index.name = 'user_id'
mention_user_id_per_user_summary_df.head()
# Join with mention_summary_type_df (both indexed by the mentioned user's id).
mention_summary_df = mention_summary_type_df.join(mention_user_id_per_user_summary_df)
# Despite its name, the "percent" column is a fraction (not multiplied by 100)
# of all users that made at least one mention.
total_mentioning_users = user_id_lookup_df['screen_name'].count()
mention_summary_df['percent_of_users_mentioning'] = (
    mention_summary_df['users_mentioning_count'] / total_mentioning_users
)
mention_summary_df.head()


## Mention summary

### Mentions per user
For users that made any mentions. It is also possible to figure this out for all users.

In [None]:
# value_counts() gives the number of mention rows per mentioning user;
# describe() then summarises the distribution of those per-user counts.
mention_df['user_id'].value_counts().describe()

### How long is the tail?

In [None]:
# Bucket the mentioned accounts by how many distinct users mentioned them.
# Per bucket, 'sum' is the total number of mentions and 'size' is the number
# of mentioned accounts in that bucket.
# Aggregations are named by string: passing np.sum / np.size is deprecated in
# pandas 2.1+, while 'sum' / 'size' produce the same column labels.
mention_grouped_by_users_mentioning_df = (
    mention_summary_df[['mention_count', 'users_mentioning_count']]
    .groupby(by='users_mentioning_count')
    .agg(['sum', 'size'])
)
mentions_per_bucket = mention_grouped_by_users_mentioning_df['mention_count', 'sum']
accounts_per_bucket = mention_grouped_by_users_mentioning_df['mention_count', 'size']
# Running totals, ordered by ascending users_mentioning_count, and their share
# of the grand total. The shares are computed from the series directly rather
# than read back through the frame's MultiIndex columns.
cumulative_mentions = mentions_per_bucket.cumsum()
cumulative_accounts = accounts_per_bucket.cumsum()
mention_grouped_by_users_mentioning_df['cumulative_mention_count_sum'] = cumulative_mentions
mention_grouped_by_users_mentioning_df['cumulative_mention_count_sum_percentage'] = cumulative_mentions / mentions_per_bucket.sum()
mention_grouped_by_users_mentioning_df['cumulative_mentioned_users'] = cumulative_accounts
mention_grouped_by_users_mentioning_df['cumulative_mentioned_users_percentage'] = cumulative_accounts / accounts_per_bucket.sum()
mention_grouped_by_users_mentioning_df

In [None]:
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline
# Plot the cumulative share of mentions and of mentioned accounts against the
# frame's index (users_mentioning_count) to show how long the tail is.
mention_grouped_by_users_mentioning_df[['cumulative_mention_count_sum_percentage', 'cumulative_mentioned_users_percentage']].plot()

## Cut off the tail.
Removes users that were only mentioned by 1 user.

In [None]:
# Accounts mentioned by exactly one user form the tail. Drop them in place so
# every later cell works on the trimmed summary.
mentioned_by_single_user = mention_summary_df['users_mentioning_count'] == 1
tail_index = mention_summary_df[mentioned_by_single_user].index
mention_summary_df.drop(tail_index, inplace=True)
# Number of remaining accounts that have a screen name.
mention_summary_df['mention_screen_name'].count()

## Approach 1: By mention count

### Top accounts (by mention count)
Unknown for type indicates that it is not matched with a known Twitter account.

In [None]:
# The 50 accounts with the highest mention counts (tail already removed).
mention_summary_df.sort_values('mention_count', ascending=False).head(50)

### Account types (by mention count)

In [None]:
# Total mentions received per account type.
types_by_mention_count_df = mention_summary_df.groupby('type')[['mention_count']].sum()
# Each type's share of all mentions (a fraction, not multiplied by 100).
all_mentions_total = types_by_mention_count_df['mention_count'].sum()
types_by_mention_count_df['type_percentage'] = (
    types_by_mention_count_df['mention_count'] / all_mentions_total
)
types_by_mention_count_df.sort_values(by='mention_count', ascending=False)

## Approach 2: Per user
Mentions by type per user.

### Add type by merging screen name lookup

In [None]:
# Attach the account type to every mention row by looking up the mentioned
# user id in the index of the known-user lookup (left join keeps all mentions).
mention_all_join_df = pd.merge(mention_df, user_type_lookup_df[['type']], how='left', left_on='mention_user_id', right_index=True)
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place form raises a FutureWarning in
# pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
mention_all_join_df['type'] = mention_all_join_df['type'].fillna('unknown')
# Drop tail: keep only mentions of accounts still present in mention_summary_df.
mention_all_join_limited_df = mention_all_join_df[mention_all_join_df.mention_user_id.isin(mention_summary_df.index)]
mention_all_join_limited_df.head()

In [None]:
# Count mentions per (mentioning user, account type), then pivot the types into
# columns. Types a user never mentioned become 0.
mentions_per_user_and_type = mention_all_join_limited_df.groupby(
    [mention_all_join_limited_df['user_id'], mention_all_join_limited_df['type']]
).size()
mention_summary_by_user_df = mentions_per_user_and_type.unstack().fillna(0)
# Add a total column
mention_summary_by_user_df['total'] = mention_summary_by_user_df.sum(axis=1)
# Every column except the trailing 'total' is an account type. The slice is
# taken once, so the *_percent columns added below are not iterated over.
type_columns = mention_summary_by_user_df.columns[:-1]
for type_name in type_columns:
    percent_column = '{}_percent'.format(type_name)
    mention_summary_by_user_df[percent_column] = (
        mention_summary_by_user_df[type_name] / mention_summary_by_user_df['total']
    )
mention_summary_by_user_df.head(10)

### Average of percent of mentions by type for each user
That is, for each user determine the percent of mentions by type. Then take the average of each type.

Thus, this mention analysis is on a per-user basis, accounting for how prolific a tweeter a user is. (That is, users who tweet more aren't weighted more heavily.)


In [None]:
# Select the columns whose name ends in "_percent" and average each one across
# users, giving every user equal weight.
mention_summary_by_user_df.filter(axis=1, regex="_percent$").mean()

## Approach 3: By count of users mentioning
The number of users that mentioned an account. Thus, each user counts as 1, even if that user made multiple mentions of the account.

This weights an account that is mentioned by 100 users more heavily than an account that is mentioned 100 times by a single user.

In [None]:
# The 20 accounts mentioned by the most distinct users (tail already removed).
mention_summary_df.sort_values('users_mentioning_count', ascending=False).head(20)

### Account types (by count of users mentioning)

In [None]:
# Sum of distinct-mentioning-user counts per account type.
types_by_users_mentioning_df = mention_summary_df.groupby('type')[['users_mentioning_count']].sum()
# Each type's share of that total (a fraction, not multiplied by 100).
all_users_mentioning_total = types_by_users_mentioning_df['users_mentioning_count'].sum()
types_by_users_mentioning_df['type_percentage'] = (
    types_by_users_mentioning_df['users_mentioning_count'] / all_users_mentioning_total
)
types_by_users_mentioning_df.sort_values(by='users_mentioning_count', ascending=False)

## Unknown accounts
Remember, the tail has been cut off

### Number of unknown accounts

In [None]:
 # Per-column non-null counts for accounts whose type is 'unknown'.
 # NOTE(review): the leading space on this cell is only tolerated by IPython;
 # it would be an IndentationError in a plain script.
 mention_summary_df[mention_summary_df.type == 'unknown'].count()

### Number of known accounts

In [None]:
 # Per-column non-null counts for accounts matched to a known type.
 # NOTE(review): the leading space on this cell is only tolerated by IPython;
 # it would be an IndentationError in a plain script.
 mention_summary_df[mention_summary_df.type != 'unknown'].count()

### Top unknown by mention count that are mentioned by at least 5 users

In [None]:
# Unknown accounts mentioned by at least 5 distinct users, most-mentioned first.
is_unknown_type = mention_summary_df['type'] == 'unknown'
has_five_or_more_mentioners = mention_summary_df['users_mentioning_count'] >= 5
report_columns = ['mention_screen_name', 'mention_count', 'users_mentioning_count']
top_not_known_mention_df = (
    mention_summary_df[is_unknown_type & has_five_or_more_mentioners]
    .sort_values('mention_count', ascending=False)[report_columns]
)
top_not_known_mention_df.head(50)

### Write top accounts to file

In [None]:
# Write the unknown-account table to unknown_mentions.csv in the working
# directory. The index (the mentioned user's id) is written as the first column.
top_not_known_mention_df.to_csv('unknown_mentions.csv')