# Data Cleaning - First 1000 row

In [1]:
#Import the needed libraries
import pandas as pd

In [2]:
df = pd.read_csv('dataset/test_v2.csv',nrows=1000)

In [3]:
df.dtypes

channelGrouping         object
customDimensions        object
date                     int64
device                  object
fullVisitorId           uint64
geoNetwork              object
hits                    object
socialEngagementType    object
totals                  object
trafficSource           object
visitId                  int64
visitNumber              int64
visitStartTime           int64
dtype: object

**Data Definition**

* **fullVisitorId** - A unique identifier for each user of the Google Merchandise Store.
* **channelGrouping** - The channel via which the user came to the Store.
* **date** - The date on which the user visited the Store.
* **device** - The specifications for the device used to access the Store.
* **geoNetwork** - This section contains information about the geography of the user.
* **socialEngagementType** - Engagement type, either "Socially Engaged" or "Not Socially Engaged".
* **totals** - This section contains aggregate values across the session.
* **trafficSource** - This section contains information about the Traffic Source from which the session originated.
* **visitId** - An identifier for this session. This is part of the value usually stored as the _utmb cookie. This is only unique to the user. For a completely unique ID, you should use a combination of fullVisitorId and visitId.
* **visitNumber** - The session number for this user. If this is the first session, then this is set to 1.
* **visitStartTime** - The timestamp .
* **hits** - This row and nested fields are populated for any and all types of hits. Provides a record of all page visits.
* **customDimensions** - This section contains any user-level or session-level custom dimensions that are set for a session. This is a repeated field and has an entry for each dimension that is set.
* **totals** - This set of columns mostly includes high-level aggregate data.

In [4]:
df.head()

Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7460955084541987166,"{""continent"": ""Asia"", ""subContinent"": ""Souther...","[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526099341,2,1526099341
1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",460252456180441002,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526064483,166,1526064483
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3461808543879602873,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526067157,2,1526067157
3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",975129477712150630,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526107551,4,1526107551
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Internet Explorer"", ""browserVersi...",8381672768065729990,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526060254,1,1526060254


In [5]:
df.columns

Index(['channelGrouping', 'customDimensions', 'date', 'device',
       'fullVisitorId', 'geoNetwork', 'hits', 'socialEngagementType', 'totals',
       'trafficSource', 'visitId', 'visitNumber', 'visitStartTime'],
      dtype='object')

From the first glance, we can see that there are a value list in some column. We can try to seperate those into another columns.

First, we can see with the **customDimensions** columns that it's a JSON type but it's in a list. So we have to flatten out the list first

There are some way we can use, we can use the manual way :

In [6]:
#Get the value in index
index = {}
for i in range(len(df['customDimensions'])):
    if df.loc[i,'customDimensions']=="[]":
        index_value = "null"
    else:         
        index_value = df.loc[i,'customDimensions'].split(" 'value':")[0].replace("[{'index': ","").replace("'","").replace(",","")
    index[i]=index_value

In [7]:
#Get the value in value
value = {}
for i in range(len(df['customDimensions'])):
    if df.loc[i,'customDimensions']=="[]":
        value_value = "null"
    else:         
        value_value = df.loc[i,'customDimensions'].split("[{'index': '4', 'value': ")[1].replace("'}]","").replace("'","")
    value[i]=value_value

or using json libraries

In [8]:
import json
customDimensions={}
for j in json.loads(df.loc[0,'customDimensions'].replace("[","").replace("]","").replace("'",'"')).keys():
    col_val = {}
    for i in range(len(df)):
        if df.loc[i,'customDimensions']=="[]":
            value = "null"
        else:
            value = df.loc[i,'customDimensions'].replace("[","").replace("]","").replace("'",'"')
            value = json.loads(value)
            value = value[j]
        col_val[i]=value
    customDimensions[j]=col_val

In [9]:
for i in customDimensions.keys():
    a = "customDimensions_"+i
    df[a] = pd.DataFrame.from_dict(customDimensions[i],orient='index')

In [10]:
df.head()

Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime,customDimensions_index,customDimensions_value
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7460955084541987166,"{""continent"": ""Asia"", ""subContinent"": ""Souther...","[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526099341,2,1526099341,4,APAC
1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",460252456180441002,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526064483,166,1526064483,4,North America
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3461808543879602873,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526067157,2,1526067157,4,North America
3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",975129477712150630,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526107551,4,1526107551,4,North America
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Internet Explorer"", ""browserVersi...",8381672768065729990,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526060254,1,1526060254,4,North America


Okay so we're done with the first column, now we can do the same thing for device, geoNetwork, hits, totals, and trafficSource. I made a function to change the format from json and then a function to change the dictionary into dataframe and enter the values into a new column in the original dataset

**Changing data from json type function**

In [11]:
def change_json(df,column):
    import json
    sub_column={}
    if df.loc[0,column][0]=="[":
        data = df.loc[0,column].replace("[","").replace("]","").replace("'",'"')
    else:
        data = df.loc[0,column]
    for j in json.loads(data).keys():
        col_val = {}
        for i in range(len(df)):
            if df.loc[i,column]=="[]":
                value = "null"
            else:
                value = json.loads(df.loc[i,column])
                value = value[j]
            col_val[i]=value
        sub_column[j]=col_val
    return sub_column

**From dictionary to new column in dataset**

In [12]:
def dict_to_df(column):
    for i in column.keys():
        df[i] = pd.DataFrame.from_dict(column[i],orient='index')

In [13]:
device = change_json(df,'device')
dict_to_df(device)

In [14]:
pd.set_option('display.max_columns',100)
df.head()

Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime,customDimensions_index,customDimensions_value,browser,browserVersion,browserSize,operatingSystem,operatingSystemVersion,isMobile,mobileDeviceBranding,mobileDeviceModel,mobileInputSelector,mobileDeviceInfo,mobileDeviceMarketingName,flashVersion,language,screenColors,screenResolution,deviceCategory
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7460955084541987166,"{""continent"": ""Asia"", ""subContinent"": ""Souther...","[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526099341,2,1526099341,4,APAC,Chrome,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile
1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",460252456180441002,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526064483,166,1526064483,4,North America,Chrome,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3461808543879602873,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526067157,2,1526067157,4,North America,Chrome,not available in demo dataset,not available in demo dataset,Chrome OS,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop
3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",975129477712150630,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526107551,4,1526107551,4,North America,Chrome,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Internet Explorer"", ""browserVersi...",8381672768065729990,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526060254,1,1526060254,4,North America,Internet Explorer,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,tablet


Now we do the rest of the variables. First lets do the same for geoNetwork

In [15]:
#geoNetwork
val = change_json(df,'geoNetwork')
dict_to_df(val)

In [16]:
df.head()

Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime,customDimensions_index,customDimensions_value,browser,browserVersion,browserSize,operatingSystem,operatingSystemVersion,isMobile,mobileDeviceBranding,mobileDeviceModel,mobileInputSelector,mobileDeviceInfo,mobileDeviceMarketingName,flashVersion,language,screenColors,screenResolution,deviceCategory,continent,subContinent,country,region,metro,city,cityId,networkDomain,latitude,longitude,networkLocation
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7460955084541987166,"{""continent"": ""Asia"", ""subContinent"": ""Souther...","[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526099341,2,1526099341,4,APAC,Chrome,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile,Asia,Southern Asia,India,Delhi,(not set),(not set),not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset
1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",460252456180441002,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526064483,166,1526064483,4,North America,Chrome,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,Americas,Northern America,United States,California,San Francisco-Oakland-San Jose CA,San Francisco,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,not available in demo dataset
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3461808543879602873,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526067157,2,1526067157,4,North America,Chrome,not available in demo dataset,not available in demo dataset,Chrome OS,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,onlinecomputerworks.com,not available in demo dataset,not available in demo dataset,not available in demo dataset
3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",975129477712150630,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526107551,4,1526107551,4,North America,Chrome,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile,Americas,Northern America,United States,Texas,Houston TX,Houston,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,not available in demo dataset
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Internet Explorer"", ""browserVersi...",8381672768065729990,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526060254,1,1526060254,4,North America,Internet Explorer,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,tablet,Americas,Northern America,United States,California,Los Angeles CA,Irvine,not available in demo dataset,com,not available in demo dataset,not available in demo dataset,not available in demo dataset


In [17]:
val = change_json(df,'hits')
dict_to_df(val)

JSONDecodeError: Expecting value: line 1 column 80 (char 79)

We can see when we're entering the hits column into the function, it gives an error. We can see that this hits column have a nested dictionary. Looks like we cannot convert the column to json with json.loads so I tried to do in the manual way

1. get all column name because each index have different length -> that what makes it error
-> get the most length out of all index -> note that there are possibilities the next 1000 row of data have larger index 
-> or to stay safe, get the least length so there will not be a lot of missing value
2. after that, match/search the value that match the column name

In [121]:
df.loc[2,'hits'].replace("[","").replace("]","").replace("'",'"')

'{"hitNumber": "1", "time": "0", "hour": "12", "minute": "32", "isInteraction": True, "referer": "https://www.google.com/", "page": {"pagePath": "/google+redesign/apparel/womens/womens+t+shirts", "hostname": "shop.googlemerchandisestore.com", "pageTitle": "Women"s T-Shirts | Apparel | Google Merchandise Store", "pagePathLevel1": "/google+redesign/", "pagePathLevel2": "/apparel/", "pagePathLevel3": "/womens/", "pagePathLevel4": "/womens+t+shirts"}, "transaction": {"currencyCode": "USD"}, "item": {"currencyCode": "USD"}, "appInfo": {"screenName": "shop.googlemerchandisestore.com/google+redesign/apparel/womens/womens+t+shirts", "landingScreenName": "shop.googlemerchandisestore.com/google+redesign/apparel/womens/womens+t+shirts", "exitScreenName": "shop.googlemerchandisestore.com/google+redesign/apparel/womens", "screenDepth": "0"}, "exceptionInfo": {"isFatal": True}, "eventInfo": {"eventCategory": "Enhanced Ecommerce", "eventAction": "Add to Cart"}, "product": {"productSKU": "GGOEGXXX0903

We can see that we have nested dictionary so we have to clean it up.

In [120]:
import re
d = df.loc[:,'hits'].replace("[","").replace("]","").replace("'",'"')
d = d.loc[9,'hits']
d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
d = d.replace("{","").replace("}","").replace('"',"")
length = len(d.split(','))
print(length,d)

IndexingError: Too many indexers

481

In [64]:
d = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')
d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
length = len(d.split(','))

test={}

for i in range(length):
    d = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')
    d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
    d = d.replace("{","").replace("}","").replace('"',"")
    d = d.split(',')[i].split(':')[-1]
    test[i]=d

In [105]:
d = df.loc[2,'hits'].replace("[","").replace("]","").replace("'",'"')
d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
d.split(',')[100]

' {"hitNumber": "3"'

In [104]:
import re
d = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')
d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
length = len(d.split(','))

hits={}

for i in range(length):
    sum_value={}
    for index in range(2):
        d = df.loc[index,'hits'].replace("[","").replace("]","").replace("'",'"')
        d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
        d = d.replace("{","").replace("}","").replace('"',"")
        d = d.split(',')[i].split(':')[-1]
        sum_value[index]=d
    key = df.loc[i,'hits'].replace("[","").replace("]","").replace("'",'"')
    key = re.sub(' "[a-zA-Z]{2,20}": {' ,"",key)
    key = key.replace("{","").replace("}","").replace('"',"")
    key = key.split(",")[i].split(":")[0]
    hits[key]=sum_value

IndexError: list index out of range

In [103]:
hits

{'hitNumber': {0: ' 1'},
 ' time': {0: ' (not set)'},
 ' hour': {0: ' True'},
 ' minute': {0: ' Brand Row 7-1'},
 ' isInteraction': {0: ' shop.googlemerchandisestore.com/home'},
 ' isEntrance': {0: ' /home'},
 'pagePath': {0: ' '},
 ' hostname': {0: ' PAGE'},
 ' pagePathLevel1': {0: ' mens-tshirts.jpg'},
 ' pagePathLevel2': {0: ' '},
 ' pagePathLevel4': {0: ' home_main_link_apparel.jpg'},
 'screenName': {0: ' '},
 ' landingScreenName': {0: ' shop.googlemerchandisestore.com'},
 ' exitScreenName': {0: ' (not set)'},
 'isFatal': {0: ' (not set)'},
 ' v2ProductName': {0: ' (not set)'},
 ' v2ProductCategory': {0: ' '},
 ' productVariant': {0: ' womens-tshirts.jpg'},
 ' productBrand': {0: ' Brand Row 7-3'},
 ' productPrice': {0: ' (not set)'},
 ' localProductPrice': {0: ' (not set)'},
 ' isImpression': {0: ' Drinkware'},
 ' productCouponCode': {0: ' Home'},
 ' customMetrics': {0: ' (not set)'},
 ' productListName': {0: ' '},
 ' productListPosition': {0: ' (not set)'},
 ' productSKU': {0: ' M

In [35]:
d = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')
d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)

import re
length = len(d.split(','))-1

hits={}
for i in range(length):
    sum_value={}
    for index in range(len(df)):
        d = df.loc[index,'hits'].replace("[","").replace("]","").replace("'",'"')
        d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
        d = d.replace("{","").replace("}","")
        value = d.replace('"',"").split(",")[i].split(":")[-1]
        sum_value[index]=value
    key = df.loc[i,'hits'].replace("[","").replace("]","").replace("'",'"')
    key = re.sub(' "[a-zA-Z]{2,20}": {' ,"",key)
    key = key.replace("{","").replace("}","").replace('"',"")
    key = key.split(",")[i].split(":")[0]
    hits[key]=sum_value

IndexError: list index out of range

In [58]:
t = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')

import re
t = re.findall(' "[a-zA-Z]{2,20}": {' ,t)
t[0].replace('"',"").replace(" ","").replace(":{","")

'page'

In [78]:
t = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')

import re
t = re.sub(' "[a-zA-Z]{2,20}": {' ,"",t)
t.replace("{","").replace("}","").replace('"',"").split(",")[1].split(":")[-1]

' 0'

In [103]:
d = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')
d = re.sub(' "[a-zA-Z]{2,20}": {' ,"",d)
d.replace("{","").replace("}","").replace('"',"").split(",")[401].split(":")[-1]

' '

In [138]:
dict_to_df(hits)

In [144]:
df.head()

Unnamed: 0,channelGrouping,customDimensions,date,device,fullVisitorId,geoNetwork,hits,socialEngagementType,totals,trafficSource,visitId,visitNumber,visitStartTime,customDimensions_index,customDimensions_value,browser,browserVersion,browserSize,operatingSystem,operatingSystemVersion,isMobile,mobileDeviceBranding,mobileDeviceModel,mobileInputSelector,mobileDeviceInfo,mobileDeviceMarketingName,flashVersion,language,screenColors,screenResolution,deviceCategory,continent,subContinent,country,region,metro,city,cityId,networkDomain,latitude,longitude,networkLocation,"""hitNumber""","""time""","""hour""","""minute""","""isInteraction""","""isEntrance""","""page""","""hostname""",...,"""productBrand""","""productPrice""","""localProductPrice""","""isImpression""","""productCouponCode""","""customMetrics""","""productListName""","""productListPosition""","""productSKU""","""experiment""","""previousContentGroup2""","""previousContentGroup3""","""previousContentGroup4""","""previousContentGroup5""","""dataSource""","""publisher_infos""",hitNumber,time,hour,minute,isInteraction,isEntrance,page,hostname,pagePathLevel1,pagePathLevel2,pagePathLevel4,appInfo,landingScreenName,exitScreenName,exceptionInfo,v2ProductName,v2ProductCategory,productVariant,productBrand,productPrice,localProductPrice,isImpression,productCouponCode,customMetrics,productListName,productListPosition,productSKU,experiment,previousContentGroup2,previousContentGroup3,previousContentGroup4,previousContentGroup5,dataSource,publisher_infos
0,Organic Search,"[{'index': '4', 'value': 'APAC'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",7460955084541987166,"{""continent"": ""Asia"", ""subContinent"": ""Souther...","[{'hitNumber': '1', 'time': '0', 'hour': '21',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526099341,2,1526099341,4,APAC,Chrome,not available in demo dataset,not available in demo dataset,Android,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile,Asia,Southern Asia,India,Delhi,(not set),(not set),not available in demo dataset,unknown.unknown,not available in demo dataset,not available in demo dataset,not available in demo dataset,"""1""","""0""","""21""","""29""",True,"""pagePath""","""Home""","""/home""",...,"""action_type""","""1""",,,"""No""",,"""EVENT""","""socialNetwork""","""","""(not set)""","""(not set)""","""(not set)""","""(not set)""","""(entrance)""","""(entrance)""","""(entrance)""",1,0,21,29,True,pagePath,Home,/home,,,shop.googlemerchandisestore.com/home,shop.googlemerchandisestore.com/home,eventCategory,Promotion Click,promoId,contentGroup1,Category Row 2,promoIsClick,action_type,1,,,No,,EVENT,socialNetwork,,(not set),(not set),(not set),(not set),(entrance),(entrance),(entrance)
1,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",460252456180441002,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '11',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526064483,166,1526064483,4,North America,Chrome,not available in demo dataset,not available in demo dataset,Macintosh,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,Americas,Northern America,United States,California,San Francisco-Oakland-San Jose CA,San Francisco,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,not available in demo dataset,"""1""","""0""","""11""","""48""",True,True,"""shop.googlemerchandisestore.com""","""Home""",...,"""home_bags_google_2.jpg""","""Row 2 Combo""","""Mens T-Shirts Row 3-1""","""Mens T-Shirts""","""womens-tshirts.jpg""","""Row 3-1""","""Womens T-Shirts Row 3-2""","""Womens T-Shirts""","""Row 3-2""","""Office""","""green_row_link_to_office.jpg""","""Row 5 Color Combo""","""Drinkware Row 4 Color Combo""","""Drinkware""","""red_row_hydrate.jpg""","""Row 4 Color Combo""",1,0,11,48,True,True,shop.googlemerchandisestore.com,Home,,,screenName,shop.googlemerchandisestore.com/home,isFatal,,Apparel,Office Row 5 Color Combo,Backpacks Row 2 Combo,Backpacks,home_bags_google_2.jpg,Row 2 Combo,Mens T-Shirts Row 3-1,Mens T-Shirts,womens-tshirts.jpg,Row 3-1,Womens T-Shirts Row 3-2,Womens T-Shirts,Row 3-2,Office,green_row_link_to_office.jpg,Row 5 Color Combo,Drinkware Row 4 Color Combo,Drinkware,red_row_hydrate.jpg,Row 4 Color Combo
2,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",3461808543879602873,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '12',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""4"", ""pageviews"": ""3"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526067157,2,1526067157,4,North America,Chrome,not available in demo dataset,not available in demo dataset,Chrome OS,not available in demo dataset,False,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,desktop,Americas,Northern America,United States,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,onlinecomputerworks.com,not available in demo dataset,not available in demo dataset,not available in demo dataset,"""1""","""0""","""12""","""32""",True,"""https","""shop.googlemerchandisestore.com""","""Women""s T-Shirts | Apparel | Google Merchand...",...,"""(not set)""","""(not set)""","""21990000""","""21990000""","""0""",,,"""(not set)""","""(not set)""","""action_type""","""1""",,,,,"""EVENT""",1,0,12,32,True,https,shop.googlemerchandisestore.com,Womens T-Shirts | Apparel | Google Merchandis...,/apparel/,/womens/,currencyCode,currencyCode,shop.googlemerchandisestore.com/google+redesi...,0,eventCategory,,Google Tee White,Home/Apparel/Womens/Womens-T-Shirts/,(not set),(not set),21990000,21990000,0,,,(not set),(not set),action_type,1,,,,,EVENT
3,Direct,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Chrome"", ""browserVersion"": ""not a...",975129477712150630,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '23',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526107551,4,1526107551,4,North America,Chrome,not available in demo dataset,not available in demo dataset,iOS,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,mobile,Americas,Northern America,United States,Texas,Houston TX,Houston,not available in demo dataset,(not set),not available in demo dataset,not available in demo dataset,not available in demo dataset,"""1""","""0""","""23""","""45""",True,True,"""pagePath""","""shop.googlemerchandisestore.com""",...,"""Backpacks""","""home_bags_google_2.jpg""","""Row 2 Combo""","""Mens T-Shirts Row 3-1""","""Womens T-Shirts""","""mens-tshirts.jpg""","""Row 3-1""","""Womens T-Shirts Row 3-2""","""womens-tshirts.jpg""","""Office Row 5 Color Combo""","""Office""","""green_row_link_to_office.jpg""","""Row 5 Color Combo""","""Drinkware Row 4 Color Combo""","""Drinkware""","""red_row_hydrate.jpg""",1,0,23,45,True,True,pagePath,shop.googlemerchandisestore.com,/home,,,screenName,0,isFatal,promoId,Row 3-2,Row 1,Backpacks Row 2 Combo,Backpacks,home_bags_google_2.jpg,Row 2 Combo,Mens T-Shirts Row 3-1,Womens T-Shirts,mens-tshirts.jpg,Row 3-1,Womens T-Shirts Row 3-2,womens-tshirts.jpg,Office Row 5 Color Combo,Office,green_row_link_to_office.jpg,Row 5 Color Combo,Drinkware Row 4 Color Combo,Drinkware,red_row_hydrate.jpg
4,Organic Search,"[{'index': '4', 'value': 'North America'}]",20180511,"{""browser"": ""Internet Explorer"", ""browserVersi...",8381672768065729990,"{""continent"": ""Americas"", ""subContinent"": ""Nor...","[{'hitNumber': '1', 'time': '0', 'hour': '10',...",Not Socially Engaged,"{""visits"": ""1"", ""hits"": ""5"", ""pageviews"": ""4"",...","{""referralPath"": ""(not set)"", ""campaign"": ""(no...",1526060254,1,1526060254,4,North America,Internet Explorer,not available in demo dataset,not available in demo dataset,Windows,not available in demo dataset,True,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,not available in demo dataset,tablet,Americas,Northern America,United States,California,Los Angeles CA,Irvine,not available in demo dataset,com,not available in demo dataset,not available in demo dataset,not available in demo dataset,"""1""","""0""","""10""","""37""",True,True,"""pagePath""","""shop.googlemerchandisestore.com""",...,"""Backpacks""","""home_bags_google_2.jpg""","""Row 2 Combo""","""Mens T-Shirts Row 3-1""","""Womens T-Shirts""","""mens-tshirts.jpg""","""Row 3-1""","""Womens T-Shirts Row 3-2""","""womens-tshirts.jpg""","""Office Row 5 Color Combo""","""Office""","""green_row_link_to_office.jpg""","""Row 5 Color Combo""","""Drinkware Row 4 Color Combo""","""Drinkware""","""red_row_hydrate.jpg""",1,0,10,37,True,True,pagePath,shop.googlemerchandisestore.com,/home,,,screenName,0,isFatal,promoId,Row 3-2,Row 1,Backpacks Row 2 Combo,Backpacks,home_bags_google_2.jpg,Row 2 Combo,Mens T-Shirts Row 3-1,Womens T-Shirts,mens-tshirts.jpg,Row 3-1,Womens T-Shirts Row 3-2,womens-tshirts.jpg,Office Row 5 Color Combo,Office,green_row_link_to_office.jpg,Row 5 Color Combo,Drinkware Row 4 Color Combo,Drinkware,red_row_hydrate.jpg


In [143]:
df.loc[3,'hits'].replace("[","").replace("]","").replace("'",'"').replace("{","").replace("}","")

'"hitNumber": "1", "time": "0", "hour": "23", "minute": "45", "isInteraction": True, "isEntrance": True, "referer": "https://www.googlemerchandisestore.com/", "page": "pagePath": "/home", "hostname": "shop.googlemerchandisestore.com", "pageTitle": "Home", "pagePathLevel1": "/home", "pagePathLevel2": "", "pagePathLevel3": "", "pagePathLevel4": "", "appInfo": "screenName": "shop.googlemerchandisestore.com/home", "landingScreenName": "shop.googlemerchandisestore.com/home", "exitScreenName": "shop.googlemerchandisestore.com/google+redesign/apparel/mens/quickview", "screenDepth": "0", "exceptionInfo": "isFatal": True, "product": , "promotion": "promoId": "Apparel Row 1", "promoName": "Apparel", "promoCreative": "home_main_link_apparel.jpg", "promoPosition": "Row 1", "promoId": "Backpacks Row 2 Combo", "promoName": "Backpacks", "promoCreative": "home_bags_google_2.jpg", "promoPosition": "Row 2 Combo", "promoId": "Mens T-Shirts Row 3-1", "promoName": "Mens T-Shirts", "promoCreative": "mens-ts

In [150]:
d = df.loc[0,'hits'].replace("[","").replace("]","").replace("'",'"')
d.replace('"',"").split(",")[7].split(":")[1]

' Home'

In [151]:
d

'{"hitNumber": "1", "time": "0", "hour": "21", "minute": "29", "isInteraction": True, "page": {"pagePath": "/home", "hostname": "shop.googlemerchandisestore.com", "pageTitle": "Home", "pagePathLevel1": "/home", "pagePathLevel2": "", "pagePathLevel3": "", "pagePathLevel4": ""}, "appInfo": {"screenName": "shop.googlemerchandisestore.com/home", "landingScreenName": "shop.googlemerchandisestore.com/home", "exitScreenName": "shop.googlemerchandisestore.com/home", "screenDepth": "0"}, "exceptionInfo": {"isFatal": True}, "eventInfo": {"eventCategory": "Enhanced Ecommerce", "eventAction": "Promotion Click"}, "product": , "promotion": {"promoId": "Category Row 2", "promoName": "Accessories", "promoCreative": "toy.png", "promoPosition": "Category Row 2"}, "promotionActionInfo": {"promoIsClick": True}, "eCommerceAction": {"action_type": "0", "step": "1"}, "experiment": , "customVariables": , "customDimensions": , "customMetrics": , "type": "EVENT", "social": {"socialNetwork": "(not set)", "hasSoc