In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from scipy.stats import linregress
import random

In [2]:
# Load the raw social-media survey CSV into a DataFrame.
csv_path = "Resources/OSF_socialmedia_data.csv"
social_media_df = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows to confirm the file parsed as expected.
social_media_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Participant,Date,Day,Time,Session.Name,Notification.No,LifePak.Download.No,Responded,Completed.Session,...,Loneliness,Concentrat,LossOfInt,Inferior,Hopeless,Stress,PSMU,AutoPSMU,News,Active
0,1,115091,05/02/2017,Tuesday,10:00:06,Daily questions,1,1,1,1,...,61.0,64.0,68.0,50.0,56.0,73.0,66.0,66,39.0,38.0
1,2,115091,05/02/2017,Tuesday,12:02:48,Daily questions,2,1,1,1,...,57.0,70.0,58.0,37.0,56.0,67.0,0.0,#skipped#,27.0,23.0
2,3,115091,05/02/2017,Tuesday,14:03:02,Daily questions,3,1,1,1,...,44.0,70.0,67.0,26.0,38.0,39.0,63.0,56,34.0,36.0
3,4,115091,05/02/2017,Tuesday,16:34:28,Daily questions,4,1,1,1,...,56.0,79.0,89.0,30.0,20.0,60.0,67.0,68,29.0,23.0
4,5,115091,05/02/2017,Tuesday,18:00:00,Daily questions,5,1,0,0,...,,,,,,,,,,


In [3]:
# Tally the non-null entries in each column; columns whose total falls
# below the row count contain missing values.
social_media_df.notna().sum()


Unnamed: 0                         12245
Participant                        12245
Date                               12245
Day                                12245
Time                               12245
Session.Name                       12245
Notification.No                    12245
LifePak.Download.No                12245
Responded                          12245
Completed.Session                  12245
Session.Instance                    8695
Session.Instance.Response.Lapse     8695
Reminders.Delivered                12245
Instr_DQs                              0
Fatigue                             8653
DeprMood                            8648
Loneliness                          8646
Concentrat                          8645
LossOfInt                           8646
Inferior                            8646
Hopeless                            8650
Stress                              8649
PSMU                                8646
AutoPSMU                            8651
News            

In [4]:
# Clean the raw survey data.
#
# 1. Drop columns that are not used downstream. 'Instr_DQs' must go before
#    the row-wise dropna below: the column-count output reports 0 non-null
#    values for it, so keeping it would cause every row to be discarded.
# 2. Drop every row that still has at least one missing value.
#
# The original cell also called `social_media_df.dropna(how="all")` without
# assigning the result; that statement had no effect and has been removed
# (the how="any" pass below already removes fully-empty rows).
cleaned_social_media = (
    social_media_df
    .drop(
        columns=[
            'LifePak.Download.No',
            'Session.Name',
            'Session.Instance.Response.Lapse',
            'Instr_DQs',
            'Unnamed: 0',
        ]
    )
    .dropna(axis=0, how='any')
)

# Persist the cleaned frame. The row index is written as the first column
# (pandas default), matching the original behaviour.
cleaned_social_media.to_csv('Resources/cleaned_data.csv')

# Preview goes last so the notebook actually renders it.
cleaned_social_media.head(10)

In [5]:
# Earliest survey date in the cleaned data, as an MM/DD/YYYY string.
# "Date" holds MM/DD/YYYY text, so a plain .min() compares lexicographically
# and is only chronologically correct while all rows share one year.
# Parse to datetimes first, then format back to the original string form.
study_began = (
    pd.to_datetime(cleaned_social_media["Date"], format="%m/%d/%Y")
    .min()
    .strftime("%m/%d/%Y")
)
study_began

'03/28/2017'

In [6]:
# Latest survey date in the cleaned data, as an MM/DD/YYYY string.
# "Date" holds MM/DD/YYYY text, so a plain .max() compares lexicographically
# and is only chronologically correct while all rows share one year.
# Parse to datetimes first, then format back to the original string form.
study_end = (
    pd.to_datetime(cleaned_social_media["Date"], format="%m/%d/%Y")
    .max()
    .strftime("%m/%d/%Y")
)
study_end

'05/26/2017'

In [7]:
# Number of distinct participant IDs remaining after cleaning.
unique_participants = cleaned_social_media["Participant"].nunique()
unique_participants

125

In [8]:
# Remove data before the first mass shooting during the timeframe:
# keep only rows dated strictly after 04/16/2017.
#
# "Date" is stored as MM/DD/YYYY text, so comparing it against a string
# literal is a lexicographic comparison that only happens to work while
# every row is in the same year. Compare parsed datetimes instead; the
# "Date" column itself is left unchanged (still text).
new_cleaned_social_media = cleaned_social_media.loc[
    pd.to_datetime(cleaned_social_media["Date"], format="%m/%d/%Y")
    > pd.Timestamp("2017-04-16")
]
new_cleaned_social_media

Unnamed: 0,Participant,Date,Day,Time,Notification.No,Responded,Completed.Session,Session.Instance,Reminders.Delivered,Fatigue,...,Loneliness,Concentrat,LossOfInt,Inferior,Hopeless,Stress,PSMU,AutoPSMU,News,Active
0,115091,05/02/2017,Tuesday,10:00:06,1,1,1,1.0,0,61.0,...,61.0,64.0,68.0,50.0,56.0,73.0,66.0,66,39.0,38.0
1,115091,05/02/2017,Tuesday,12:02:48,2,1,1,2.0,0,28.0,...,57.0,70.0,58.0,37.0,56.0,67.0,0.0,#skipped#,27.0,23.0
2,115091,05/02/2017,Tuesday,14:03:02,3,1,1,3.0,0,24.0,...,44.0,70.0,67.0,26.0,38.0,39.0,63.0,56,34.0,36.0
3,115091,05/02/2017,Tuesday,16:34:28,4,1,1,4.0,0,63.0,...,56.0,79.0,89.0,30.0,20.0,60.0,67.0,68,29.0,23.0
5,115091,05/02/2017,Tuesday,20:02:12,6,1,1,5.0,0,16.0,...,0.0,27.0,83.0,15.0,3.0,25.0,16.0,10,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12240,9873359,04/25/2017,Tuesday,14:00:09,3,1,1,81.0,0,15.0,...,0.0,14.0,0.0,0.0,0.0,14.0,22.0,0,0.0,0.0
12241,9873359,04/25/2017,Tuesday,16:40:58,4,1,1,82.0,0,22.0,...,0.0,11.0,0.0,0.0,0.0,0.0,6.0,0,0.0,24.0
12242,9873359,04/25/2017,Tuesday,18:00:06,5,1,1,83.0,0,4.0,...,0.0,10.0,0.0,0.0,0.0,0.0,11.0,0,0.0,18.0
12243,9873359,04/25/2017,Tuesday,20:05:31,6,1,1,84.0,0,0.0,...,8.0,22.0,8.0,0.0,0.0,15.0,10.0,0,0.0,0.0


In [9]:
# Sanity check: earliest date left after the cutoff filter, shown as an
# MM/DD/YYYY string. Dates are parsed before taking the minimum so the
# result is chronological rather than lexicographic.
pd.to_datetime(new_cleaned_social_media["Date"], format="%m/%d/%Y").min().strftime("%m/%d/%Y")

'04/17/2017'

In [10]:
# Number of distinct participant IDs remaining after the date cutoff.
unique_participants2 = new_cleaned_social_media["Participant"].nunique()
unique_participants2

122

In [11]:
# Group the "News" responses by (Date, Participant).
# Note: despite its name, nw_count_df is a grouped Series, not a DataFrame.
nw_count_df = new_cleaned_social_media.groupby(by=["Date", "Participant"])["News"]

# Summary statistics (count, mean, std, min, quartiles, max) per group.
daily_news_stats_df = nw_count_df.describe()
daily_news_stats_df.head(n=5)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Date,Participant,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
04/17/2017,438907,4.0,37.75,32.622845,0.0,15.75,42.5,64.5,66.0
04/17/2017,572172,6.0,43.666667,20.42221,19.0,26.5,46.0,61.0,65.0
04/17/2017,696084,5.0,42.0,34.102786,7.0,11.0,38.0,77.0,77.0
04/17/2017,1102830,4.0,26.0,30.419292,0.0,0.0,23.0,49.0,58.0
04/17/2017,1274514,6.0,24.5,17.896927,0.0,13.75,24.5,34.5,50.0


In [12]:
# Largest number of "News" responses recorded in any (Date, Participant) group.
max_news_entry = daily_news_stats_df.loc[:, "count"].max()
max_news_entry

10.0

In [13]:
# Keep only the (Date, Participant) groups with more than 4 "News" responses.
# Note: despite the "drop_" prefix, this frame holds the rows that are kept.
has_more_than_four = daily_news_stats_df["count"].gt(4)
drop_newssurvey_entries = daily_news_stats_df[has_more_than_four]
drop_newssurvey_entries

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Date,Participant,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
04/17/2017,572172,6.0,43.666667,20.422210,19.0,26.50,46.0,61.0,65.0
04/17/2017,696084,5.0,42.000000,34.102786,7.0,11.00,38.0,77.0,77.0
04/17/2017,1274514,6.0,24.500000,17.896927,0.0,13.75,24.5,34.5,50.0
04/17/2017,1318587,5.0,10.400000,16.577093,0.0,0.00,0.0,14.0,38.0
04/17/2017,1500743,6.0,27.500000,23.805462,0.0,10.75,24.5,40.5,64.0
...,...,...,...,...,...,...,...,...,...
05/24/2017,5625691,5.0,41.000000,38.457769,0.0,0.00,54.0,74.0,77.0
05/24/2017,7949149,6.0,57.333333,3.141125,53.0,55.00,58.5,59.0,61.0
05/24/2017,8353768,5.0,7.200000,8.843076,0.0,1.00,3.0,11.0,21.0
05/24/2017,8810136,5.0,30.400000,22.611944,0.0,22.00,26.0,45.0,59.0


In [14]:
# From the groups kept above, select those whose lowest "News" response
# of the day is greater than 49.9.
min_above_threshold = drop_newssurvey_entries["min"].gt(49.9)
average_rate_news_entry = drop_newssurvey_entries[min_above_threshold]
average_rate_news_entry.head(n=50)

Unnamed: 0_level_0,Unnamed: 1_level_0,count,mean,std,min,25%,50%,75%,max
Date,Participant,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
04/17/2017,4188405,7.0,66.571429,4.894117,57.0,66.0,67.0,68.5,73.0
04/17/2017,4850438,5.0,50.0,0.0,50.0,50.0,50.0,50.0,50.0
04/19/2017,4188405,7.0,68.0,5.91608,62.0,63.5,67.0,72.0,76.0
04/19/2017,6716111,7.0,76.714286,4.644505,71.0,74.0,76.0,79.0,84.0
04/20/2017,4188405,7.0,59.571429,2.819997,57.0,57.5,59.0,60.5,65.0
04/20/2017,6716111,5.0,85.4,7.46994,76.0,82.0,83.0,92.0,94.0
04/21/2017,6716111,5.0,81.8,5.167204,74.0,81.0,81.0,86.0,87.0
04/22/2017,4188405,7.0,64.142857,1.676163,63.0,63.0,63.0,65.0,67.0
04/24/2017,5663344,5.0,66.2,7.79102,55.0,63.0,68.0,69.0,76.0
04/25/2017,4188405,7.0,62.714286,1.253566,61.0,62.0,62.0,64.0,64.0
