In [1]:
import pandas as pd
import os

In [2]:
# Directory (relative to where the notebook is launched) holding the gzipped CSV extracts.
# The files are read via explicit paths instead of os.chdir(): chdir returns None (so the
# old `cwd` variable was always None) and made this cell fail when run a second time.
DATA_DIR = 'doximity'


def load_table(filename):
    """Read one gzip-compressed CSV from DATA_DIR and return it as a DataFrame."""
    return pd.read_csv(os.path.join(DATA_DIR, filename), compression='gzip')


ext_views = load_table('external_profile_views.csv.gz')
int_views = load_table('internal_profile_views.csv.gz')
titles = load_table('titles.csv.gz')
users = load_table('users.csv.gz')
user_prof_urls = load_table('user_profile_urls.csv.gz')

In [3]:
# Compare the volume of internal vs. external profile views.
# Internal total = number of rows in int_views (assumes one row per view -- TODO confirm);
# external total = sum of the 'unique_PageViews' column of ext_views.
raw_counts = pd.Series(
    {
        'internal': int_views.shape[0],
        'external': ext_views['unique_PageViews'].sum(),
    },
    name='Raw Count',
)
counts = raw_counts.to_frame()

# Share of each source in the combined total, as a percentage rounded to 2 decimals.
# Label-based column access replaces the positional `counts.sum()[0]`, which is deprecated
# on label-indexed Series in recent pandas.
counts['Pct of Total'] = (100 * counts['Raw Count'] / counts['Raw Count'].sum()).round(2)

#External views make up 99% of all views
# Rows and columns are shown alphabetically, matching the layout of the rendered table.
counts.sort_index().sort_index(axis=1)

Unnamed: 0,Pct of Total,Raw Count
external,99.19,1318492
internal,0.81,10806


In [13]:
# Tally how many internal views each viewed user received (one output row per viewed_user_id).
views_per_user = int_views.groupby('viewed_user_id').size()
int_counts = views_per_user.rename('internal_views').reset_index()

# A handful of "superstar" profiles stand out: the single most-viewed profile
# accounts for almost 4% of all internal views.
top_internal_views = int_counts['internal_views'].max()
print("Top profile's internal views: {}".format(top_internal_views))

Top profile's internal views: 406


In [6]:
# Bucket ages into 10-year-wide bins so age can be used as a grouping dimension.
# Bin edges start at the youngest age and step by 10 until the oldest age is covered.
youngest, oldest = users['age'].min(), users['age'].max()
bins = list(range(youngest, oldest + 10, 10))

# Each label is "<lower edge>-<upper edge>" for a pair of consecutive edges.
labels = [f'{lower}-{upper}' for lower, upper in zip(bins, bins[1:])]

# include_lowest=True keeps the youngest age inside the first bin.
users['age_bin'] = pd.cut(users['age'], bins=bins, labels=labels, include_lowest=True)

In [7]:
# Bring users, job titles and per-user internal view counts onto one frame so the
# dimensions of interest (age, region, title) can be analysed together.

# Align key column names first: titles.id -> title_id, int_counts.viewed_user_id -> id.
titles = titles.rename(columns={'id': 'title_id'})
int_counts = int_counts.rename(columns={'viewed_user_id': 'id'})

# Attach the title text to each user; the numeric title_id is no longer needed afterwards.
users_with_titles = users.merge(titles, on='title_id').drop(columns='title_id')

# NOTE(review): merge defaults to an inner join, so users with no row in int_counts
# (i.e. no internal views) are dropped here and are not counted in later group sizes.
merged = users_with_titles.merge(int_counts, on='id')

In [19]:
def summarize_internal_views(frame, dimension):
    """Summarise internal views per value of one demographic dimension.

    Parameters
    ----------
    frame : DataFrame with an 'internal_views' column and a `dimension` column.
    dimension : name of the column to group by (e.g. 'title', 'age_bin', 'region').

    Returns
    -------
    DataFrame indexed by `dimension` with two-level columns:
    ('internal_views', 'total views'), ('internal_views', 'group size') and
    'average views' (= total views / group size).
    """
    summary = (
        frame[['internal_views', dimension]]
        .groupby([dimension])
        .agg({"internal_views": [("total views", "sum"), ("group size", "count")]})
    )
    per_group = summary['internal_views']
    summary['average views'] = per_group['total views'] / per_group['group size']
    return summary


# One summary table per dimension of interest.
# NOTE(review): group size counts only the rows present in `merged`; if `merged` was
# inner-joined to the view counts, users with zero internal views are not included.
title_sum = summarize_internal_views(merged, 'title')
age_sum = summarize_internal_views(merged, 'age_bin')
region_sum = summarize_internal_views(merged, 'region')

# we can see that full professors get far more internal views than other job titles, focus on growing their ranks to drive
# internal traffic
title_sum

Unnamed: 0_level_0,internal_views,internal_views,average views
Unnamed: 0_level_1,total views,group size,Unnamed: 3_level_1
title,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Assistant professor,738,471,1.566879
Full professor,4310,1003,4.297109
Other,717,513,1.397661
Postdoctoral assistant,2118,1419,1.4926
Student,2923,2058,1.420311


At first glance, it looked like users in user_profile_urls had multiple user IDs (under the assumption that a URL is unique; otherwise, how would you ensure accurate routing and avoid URL collisions?). Looking at the users table, I can see that there are multiple users with the same last name and IDs that match up to user_prof_urls.
This demonstrates that duplicate URLs are allowed, which means we can't merge external stats to users on profile_url as I'd hoped. External views are most likely aggregated per unique URL, since there's only one entry for '/pub/david-brown', a URL that had many associated users. Since there's no way to match to a specific user, we'll run some stats on the internal views instead. This is unfortunately rather limiting, given that external views make up the lion's share of all views.

In [29]:
# Rebuild `merged` as users + titles only: the earlier version had been joined to
# int_counts, which is not wanted for the per-view analysis below.
merged = pd.merge(users, titles, on='title_id').drop(columns='title_id')

# Every internal view carries the id of both the viewed user and the visitor, so we can
# check whether people similar to the viewed user are the ones driving traffic.

# Step 1: attach the viewed user's demographics to each view.
add_viewee_dims = pd.merge(int_views, merged, left_on='viewed_user_id', right_on='id')

# Step 2: attach the visitor's demographics as well; overlapping columns are told apart
# by the '_viewed' / '_guest' suffixes.
click_demographics = pd.merge(
    add_viewee_dims,
    merged,
    left_on='user_id',
    right_on='id',
    suffixes=('_viewed', '_guest'),
)

In [30]:
# Cross-tabulate each dimension: rows = viewed profile's value, columns = visitor's value,
# cells = number of internal views.
title_matrix = pd.crosstab(
    index=click_demographics['title_viewed'],
    columns=click_demographics['title_guest'],
)
age_matrix = pd.crosstab(
    index=click_demographics['age_bin_viewed'],
    columns=click_demographics['age_bin_guest'],
)
region_matrix = pd.crosstab(
    index=click_demographics['region_viewed'],
    columns=click_demographics['region_guest'],
)

# Titles: most of the professor views come from students and postdocs. Focus on retaining
# professors since their profiles drive views, and grow the academic base as well to
# drive that interaction.
display(title_matrix)

# Ages: viewers of all age groups are primarily looking at 20-30 profiles.
display(age_matrix)

# Regions: views are fairly balanced, except for 'Other' -- not sure what that represents.
display(region_matrix)

title_guest,Assistant professor,Full professor,Other,Postdoctoral assistant,Student
title_viewed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Assistant professor,42,22,95,241,338
Full professor,325,128,548,1395,1914
Other,50,16,87,226,338
Postdoctoral assistant,139,74,269,707,929
Student,208,84,343,933,1355


age_bin_guest,20-30,30-40,40-50,50-60,60-70,70-80,80-90
age_bin_viewed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
20-30,2855,1098,421,351,806,105,63
30-40,920,359,128,114,282,26,12
40-50,366,142,65,53,116,13,10
50-60,341,116,57,49,78,9,4
60-70,783,322,117,110,191,33,18
70-80,81,30,16,10,26,3,4
80-90,48,21,10,5,17,2,0


region_guest,Midwest,Northeast,Other,South,West
region_viewed,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Midwest,643,739,327,568,631
Northeast,573,696,319,549,643
Other,192,198,90,161,194
South,557,559,233,455,528
West,445,472,215,382,437


In [46]:
# NOTE(review): this cell is a bare dict literal; running it only echoes the dict as the
# cell output and assigns nothing. Presumably it was meant as editor/notebook settings
# (line wrapping) rather than analysis code -- confirm, then move it to a settings file
# or delete the cell.
{
   "codeCellConfig": {
      "lineWrap": "on",
      "wordWrapColumn": 75
   }
}

{'codeCellConfig': {'lineWrap': 'on', 'wordWrapColumn': 75}}

# Summary
### Demographic trends and the drivers/beneficiaries of traffic
#### Looking at 3 main dimensions: 'age_bin', 'region', and 'title'
    - Full professors have an average click rate 2-3x other titles, driven primarily by post-docs and students
    - Profiles of 20-30 year olds are viewed most by all age groups  
    - Regional differences in view rate are not particularly compelling
   
#### Total views
    - External views comprise 99% of all views
    - It's a little strange for a site to have so much inbound traffic but so little internal activity. My first
    course of action would be to investigate whether there's some impediment/bug on the sign-up page
    
#### Takeaway
    - Grow internal views: outside traffic is far harder to monetize and has short dwell times, making any ads sold against it rather low margin.
    - Focus on tenured professors since they drive traffic; concurrently, ensure a solid student/post-doc base 
    because they drive the views. You can't have one without the other.
    - For the ranking feature, I would use the co-occurrence logic to display people with demographics that align with the viewer's typical browsing interest. Alongside each result should be that person's most impactful action/post in terms of view growth, if computationally feasible. This will spur people to pursue those actions, increasing the share of internal views (think LinkedIn's superstar ranking, where they prod you to complete your profile to get views).