In [16]:
import pandas as pd
import numpy as np
import pyreadstat
import re
from datetime import datetime
import pyreadstat
import json

In [17]:
pd.options.display.max_columns = 100

#### Geospatial data

We would like to visualize the spread of vegetarian restaurants across different states in the United States. So we extracted the coloumns called PROVINCE. We then used US state mapper to map the state names to its abbreviations which is in the data. We created a new data frame which contains only two columns - Province and the number of restaurants in each province. The columns have been renamed to region and value to be compatible to the library that we are going to use. 

In [18]:
df_1 = pd.read_csv("../project_data/vegetarian_restaurants_US_datafiniti.csv")

In [19]:
df_state = df_1.groupby(by='province').agg({'name':'nunique'})

In [20]:
df_state.reset_index(inplace=True)

In [21]:
    us_state_abbrev = {
        'AL': 'Alabama',
        'AK': 'Alaska',
        'AZ': 'Arizona',
        'AR': 'Arkansas',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'HI': 'Hawaii',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'IA': 'Iowa',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'ME': 'Maine',
        'MD': 'Maryland',
        'MA': 'Massachusetts',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MS': 'Mississippi',
        'MO': 'Missouri',
        'MT': 'Montana',
        'NE': 'Nebraska',
        'NV': 'Nevada',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NY': 'New York',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VT': 'Vermont',
        'VA': 'Virginia',
        'WA': 'Washington',
        'WV': 'West Virginia',
        'WI': 'Wisconsin',
        'WY': 'Wyoming',
    }

In [None]:
def func(x, us_):
    return us_[x].lower()

In [None]:
df_state.rename(columns={'province':'region','name':'value'},inplace=True)

In [None]:
df_state = df_state[df_state['region']!="AU-WA"]
df_state = df_state[df_state['region']!="DC"]

In [None]:
df_state['region'] = df_state['region'].apply(lambda x: func(x,us_state_abbrev))

In [None]:
df_state.to_csv('../project_data/states_grouped.csv',index=False)

### Reading .sav extension files

In [3]:
df, meta = pyreadstat.read_sav("../project_data/respondents.sav")

In [4]:
df1, meta = pyreadstat.read_sav("../project_data/Faunalytics-Current-Former-Veg-Study-Dataset-All-Respondents-Original-Variables.sav")

In [5]:
df_reasons = df[['id','ALLANIMALPROTECTION','ALLENVIRONMENT','ALLCOST','ALLHEALTH','ALLRELIGIOUSSPIRITUAL',
                'ALLSOCIALINFLUENCE','ALLSOCIALJUSTICEWORLDHUNGER','ALLFOODTREND','ALLDISGUST']]

In [6]:
df_reasons.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reasons.dropna(inplace=True)


In [7]:
df_reasons.drop(columns=['id'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [8]:
df_reasons = df_reasons.apply(pd.Series.value_counts)

In [9]:
df_reasons = df_reasons.transpose()

In [10]:
df_reasons = df_reasons.rename(columns={2.0:"Very little",3.0:"little",1.0:"Not at all",4.0:"high",5.0:"Very High"})

In [11]:
df_modified = df_reasons.transpose()

In [12]:
df_modified.reset_index(inplace=True)

In [13]:
df_modified = df_modified.to_dict('list')

In [14]:
f = open("../project_data/interactive_plots.json", "w")
json.dump(df_modified,f)
f.close()

### Inconvience because of veganism

In [15]:
df.shape

(1387, 1023)

In [51]:
df_inconvience = df[['id','ALLINCONVENIENCE1','ALLINCONVENIENCE2','ALLINCONVENIENCE3','ALLINCONVENIENCE4','ALLINCONVENIENCE5','ALLINCONVENIENCE6',
   'ALLINCONVENIENCE7']]

In [52]:
def function_transform(x):
    if int(x)==2:
        return 1
    elif int(x)==4:
        return 5
    else:
        return int(x)

In [53]:
df_inconvience[df_inconvience['ALLINCONVENIENCE1'].isna() == True]

Unnamed: 0,id,ALLINCONVENIENCE1,ALLINCONVENIENCE2,ALLINCONVENIENCE3,ALLINCONVENIENCE4,ALLINCONVENIENCE5,ALLINCONVENIENCE6,ALLINCONVENIENCE7
25,264.0,,,,,,,
80,776.0,,,,,,,
152,1302.0,,,,,,,
178,1483.0,,,,,,,
229,1919.0,,,,,,,
303,2560.0,,,,,,,
351,3048.0,,,,,,,
393,3583.0,,,,,,,
407,3768.0,,,,,,,
574,5137.0,,,,,,,


In [54]:
df_inconvience[df_inconvience['ALLINCONVENIENCE3'].isna() == True]

Unnamed: 0,id,ALLINCONVENIENCE1,ALLINCONVENIENCE2,ALLINCONVENIENCE3,ALLINCONVENIENCE4,ALLINCONVENIENCE5,ALLINCONVENIENCE6,ALLINCONVENIENCE7
25,264.0,,,,,,,
80,776.0,,,,,,,
152,1302.0,,,,,,,
178,1483.0,,,,,,,
229,1919.0,,,,,,,
303,2560.0,,,,,,,
351,3048.0,,,,,,,
393,3583.0,,,,,,,
407,3768.0,,,,,,,
574,5137.0,,,,,,,


In [55]:
df_inconvience.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inconvience.dropna(inplace=True)


In [56]:
df_inconvience['ALLINCONVENIENCE1'] = df_inconvience['ALLINCONVENIENCE1'].apply(lambda x:function_transform(x))

df_inconvience['ALLINCONVENIENCE2'] = df_inconvience['ALLINCONVENIENCE2'].apply(lambda x:function_transform(x))

df_inconvience['ALLINCONVENIENCE3'] = df_inconvience['ALLINCONVENIENCE3'].apply(lambda x:function_transform(x))

df_inconvience['ALLINCONVENIENCE4'] = df_inconvience['ALLINCONVENIENCE4'].apply(lambda x:function_transform(x))

df_inconvience['ALLINCONVENIENCE5'] = df_inconvience['ALLINCONVENIENCE5'].apply(lambda x:function_transform(x))

df_inconvience['ALLINCONVENIENCE6'] = df_inconvience['ALLINCONVENIENCE6'].apply(lambda x:function_transform(x))

df_inconvience['ALLINCONVENIENCE7'] = df_inconvience['ALLINCONVENIENCE7'].apply(lambda x:function_transform(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inconvience['ALLINCONVENIENCE1'] = df_inconvience['ALLINCONVENIENCE1'].apply(lambda x:function_transform(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inconvience['ALLINCONVENIENCE2'] = df_inconvience['ALLINCONVENIENCE2'].apply(lambda x:function_transform(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-vi

In [57]:
df_inconvience.rename(columns={'ALLINCONVENIENCE1':'complication in preparing','ALLINCONVENIENCE2':'time consuming',
                              'ALLINCONVENIENCE3':'no restuarants for grab n go','ALLINCONVENIENCE4':'easy access',
                              'ALLINCONVENIENCE5':'have to prepare both vegatarian and non vegetarian',
                              'ALLINCONVENIENCE6':'hard time grocery shopping','ALLINCONVENIENCE7':'difficult during time of transition'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [58]:
df_inconvience.drop(columns=['id'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [59]:
df_inconvience = df_inconvience.apply(pd.Series.value_counts)

In [60]:
df_inconvience = df_inconvience.transpose()

In [61]:
df_inconvience = df_inconvience.rename(columns={1:"Disagree",5:"Neither agree nor disagree",3:"Agree"})
df_inconvience.reset_index(inplace=True)

In [62]:
df_inconvience.rename(columns={'index':'reason'},inplace=True)

In [63]:
df_inconvience

Unnamed: 0,reason,Disagree,Agree,Neither agree nor disagree
0,complication in preparing,804,285,280
1,time consuming,686,312,371
2,no restuarants for grab n go,521,291,557
3,easy access,828,254,287
4,have to prepare both vegatarian and non vegeta...,801,175,393
5,hard time grocery shopping,1082,158,129
6,difficult during time of transition,698,315,356


In [64]:
df_inconvience.to_csv('../project_data/inconvience_reasons.csv',index=False,encoding='utf-8')

### Transition and length of veganism and what age groups are feeling a difficulty of transition

In [61]:
df_categorical = df[['id','ALLAGEADOPTION','ALLTRANSITION','ALLLENGTH']]

In [62]:
def func_age(x):
    if x>=0 and x<=20:
        return "0-20"
    elif x>20 and x<=40:
        return "20-40"
    elif x>40 and x<=60:
        return "40-60"
    elif x>60 and x<=80:
        return "60-80"
    else:
        return "80-100"

In [63]:
df_categorical['Age'] = df_categorical['ALLAGEADOPTION'].apply(lambda x: func_age(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical['Age'] = df_categorical['ALLAGEADOPTION'].apply(lambda x: func_age(x))


In [64]:
df_age_transition = df_categorical[['id','Age']]

In [65]:
df_merge = pd.merge(df_age_transition, df[['id','ALLINCONVENIENCE7']],left_on="id",right_on="id")

In [66]:
df_merge.dropna(inplace=True)

In [67]:
df_merge['id'] = df_merge['id'].apply(lambda x:int(x))

In [68]:
df_merge['ALLINCONVENIENCE7'] = df_merge['ALLINCONVENIENCE7'].apply(lambda x:int(x))

In [69]:
df_merge.rename(columns={'ALLINCONVENIENCE7':'difficulty_during_transition'},inplace=True)

In [70]:
df_merge.to_csv("../project_data/age_difficulty_during_transition.csv",index=False)

In [71]:
df_categorical.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical.dropna(inplace=True)


In [72]:
df_categorical['ALLLENGTH'] = df_categorical['ALLLENGTH'].apply(lambda x:int(x))
df_categorical['ALLTRANSITION'] = df_categorical['ALLTRANSITION'].apply(lambda x:int(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical['ALLLENGTH'] = df_categorical['ALLLENGTH'].apply(lambda x:int(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical['ALLTRANSITION'] = df_categorical['ALLTRANSITION'].apply(lambda x:int(x))


In [73]:
def function_for_transition_length(x):
    if x==1:
        return "0-3 months"
    elif x==2:
        return "4-11 months"
    elif x==3:
        return "1-2 years"
    elif x==4:
        return "3-5 years"
    elif x==5:
        return "6-9 years"
    elif x==6:
        return "9 or more years"
    else:
        return "No idea"

In [74]:
df_categorical['ALLLENGTH'] = df_categorical['ALLLENGTH'].apply(lambda x:function_for_transition_length(x))
df_categorical['ALLTRANSITION'] = df_categorical['ALLTRANSITION'].apply(lambda x:function_for_transition_length(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical['ALLLENGTH'] = df_categorical['ALLLENGTH'].apply(lambda x:function_for_transition_length(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_categorical['ALLTRANSITION'] = df_categorical['ALLTRANSITION'].apply(lambda x:function_for_transition_length(x))


In [75]:
df_categorical.rename(columns={'ALLTRANSITION':'Time_for_transition','ALLLENGTH':'Length_of_veganism'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [76]:
df_categorical.to_csv('../project_data/age_transition_legth.csv',index=False,encoding='utf-8')

### Time series data

In [1]:
def func(x):
    try:
        x = datetime.fromisoformat(x[:-1])
        x = x.strftime('%Y-%m')
    except Exception as e:
        x = None
    return x

In [None]:
df_1['date'] = df_1['dateAdded'].apply(lambda x:func(x))

In [None]:
df_time_series = df_1['date'].value_counts().rename_axis('date').reset_index(name='counts')

In [None]:
df_time_series.to_csv('../project_data/timeseries_opening_of_restuarants.csv',index=False)