In [2]:
# Option 1: keep the labels and the counts in two parallel lists, paired by position.
content_ratings = [
    '4+',
    '9+',
    '12+',
    '17+',
]
numbers = [
    4433,
    987,
    1155,
    622,
]

# Option 2: bundle both rows into one nested list (row 0 = labels, row 1 = counts).
content_rating_numbers = [
    ['4+', '9+', '12+', '17+'],
    [4433, 987, 1155, 622],
]

What if we could transform the index numbers to content rating values? This way, the mapping between content ratings and their corresponding numbers should become much clearer.

Fortunately, we can do this using a dictionary:

In [3]:
# Map each content rating (key) directly to its number of apps (value).
content_ratings = {
    '4+': 4433,
    '9+': 987,
    '12+': 1155,
    '17+': 622,
}
# Python 3.7+ dictionaries keep insertion order, so the printed order matches the
# literal above; older interpreters may print the pairs in a different order.
print(content_ratings)

{'4+': 4433, '9+': 987, '12+': 1155, '17+': 622}


With dictionaries, values are retrieved by key rather than by position, so there's no longer a connection between an index number and where a value sits in the dictionary; the order of the entries becomes unimportant for lookups.

In [4]:
content_ratings = {
    '4+': 4433,
    '9+': 987,
    '12+': 1155,
    '17+': 622,
}
# Retrieve values by their key instead of by a positional index.
over_9, over_17 = content_ratings['9+'], content_ratings['17+']
print(over_9, over_17, sep='\n')

987
622


Alternatively, we can create a dictionary and populate it with values by following these steps:

We create an empty dictionary.

We add values one by one to that empty dictionary.

In [5]:
# Start from an empty dictionary, then insert one key/value pair at a time.
content_ratings = {}
for rating, n_apps in (('4+', 4433), ('9+', 987), ('12+', 1155), ('17+', 622)):
    content_ratings[rating] = n_apps

# Read a single value back by its key.
over_12_n_apps = content_ratings['12+']

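Dictionary keys can be of almost any data type we've learned so far, except lists and dictionaries. If we use lists or dictionaries as dictionary keys, the computer raises an error. Dictionary values, on the other hand, can be of any data type, including lists and dictionaries, as the example below shows: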

In [6]:
# Every key is a string; each value has a different data type.
d_1 = {
    'key_1': 'first_value',      # str
    'key_2': 2,                  # int
    'key_3': 3.14,               # float
    'key_4': True,               # bool
    'key_5': [4, 2, 1],          # list
    'key_6': {'inner_key': 6},   # dict
}

Use the `in` operator to check whether a key exists in a dictionary. On a dictionary, `in` checks only the keys; it never checks the values. On a list, by contrast, `in` checks the elements.

In [7]:
content_ratings = {
    '4+': 4433,
    '9+': 987,
    '12+': 1155,
    '17+': 622,
}

# On a dictionary, `in` inspects the keys only.
is_in_dictionary_1 = "9+" in content_ratings   # "9+" is a key
is_in_dictionary_2 = 987 in content_ratings    # 987 appears only as a value

print(is_in_dictionary_1, is_in_dictionary_2, sep="\n")

True
False


In [8]:
# Approach 1: put the membership test directly in the condition.
if '17+' in content_ratings:
    result = "It exists"
    print(result)


# Approach 2: store the boolean outcome first, then branch on it.
is_in_dictionary = '17+' in content_ratings

if is_in_dictionary:
    result = "It exists"
    print(result)

It exists
It exists


# We can update (change) dictionary values. To update a dictionary value, we reference it by its corresponding dictionary key.

In [3]:
from csv import reader

# newline="" hands line-ending handling to the csv module, as the csv documentation
# recommends, so that newlines embedded in quoted fields are parsed correctly.
with open("AppleStore.csv", encoding="UTF-8", newline="") as opened_file:
    read_file = reader(opened_file)
    # Materialize every row as a list of lists; the loops in this notebook skip
    # index 0, treating it as the header row.
    apps_data = list(read_file)
    

In [4]:
# Frequency table: content rating -> number of apps carrying that rating.
content_ratings = {}
for row in apps_data[1:]:  # index 0 is skipped (header row)
    c_rating = row[10]
    # A rating not seen before counts as 0, so its first occurrence stores 1.
    content_ratings[c_rating] = content_ratings.get(c_rating, 0) + 1

In [5]:
# Bare expression: the notebook displays the frequency table as this cell's output.
content_ratings

{'4+': 4433, '12+': 1155, '9+': 987, '17+': 622}

In [6]:
# Alternative method: pre-seed every expected rating with a count of 0, then
# increment only ratings that are already keys (unexpected ratings are ignored).

content_ratings = {'4+': 0, '9+': 0, '12+': 0, '17+': 0}

# Skip the row at index 0 like the other counting loops in this notebook do, so the
# result does not rely on the header's label happening not to be a seeded key.
for row in apps_data[1:]:
    c_rating = row[10]
    if c_rating in content_ratings:
        content_ratings[c_rating] += 1

print(content_ratings)

{'4+': 4433, '9+': 987, '12+': 1155, '17+': 622}


The number of times a unique value occurs is also called **frequency**

When we're analyzing frequencies, we might be interested in answering questions about proportions and percentages:

What proportion of apps have a content rating of 4+?

What percentage of apps have a content rating of 17+?

What percentage of apps can a 15-year-old download?

In [7]:
# Frequency table: app genre -> number of apps in that genre.
genre_counting = {}

for row in apps_data[1:]:  # index 0 is skipped (header row)
    genre = row[11]
    # setdefault stores 0 the first time a genre appears and returns the
    # current count either way, so one statement covers both cases.
    genre_counting[genre] = genre_counting.setdefault(genre, 0) + 1

print(genre_counting)

# The most common app genre is 'Games' (there are 3862 gaming apps)

{'Social Networking': 167, 'Photo & Video': 349, 'Games': 3862, 'Music': 138, 'Reference': 64, 'Health & Fitness': 180, 'Weather': 72, 'Utilities': 248, 'Travel': 81, 'Shopping': 122, 'News': 75, 'Navigation': 46, 'Lifestyle': 144, 'Entertainment': 535, 'Food & Drink': 63, 'Sports': 114, 'Book': 112, 'Finance': 104, 'Education': 453, 'Productivity': 178, 'Business': 57, 'Catalogs': 10, 'Medical': 23}


# Transform frequencies to proportions or percentages

In [8]:
# Derive the number of apps from the data instead of hard-coding it: every row of
# apps_data except index 0, which the counting loops skip as the header row.
total_number_of_apps = len(apps_data) - 1

# NOTE: the counts are overwritten in place, so run this cell only once per freshly
# built frequency table; running it again would rescale the percentages a second time.
for key in content_ratings:
    content_ratings[key] /= total_number_of_apps  # count -> proportion
    content_ratings[key] *= 100                   # proportion -> percentage
content_ratings

{'4+': 61.595109073224954,
 '9+': 13.714047519799916,
 '12+': 16.04835348061692,
 '17+': 8.642489926358204}

In [9]:
# Share of apps a 15-year-old may download: add up the ratings whose age
# threshold is at most 15 (4+, 9+ and 12+).

percentage_15_allowed = (
    content_ratings["4+"]
    + content_ratings["9+"]
    + content_ratings["12+"]
)
percentage_15_allowed

91.35751007364179

We transformed frequencies to proportions or percentages by overwriting the initial dictionary values.

However, we'll often need to keep the dictionaries separate for later analysis.

We can create a new dictionary instead of overwriting the values in the initial dictionary.

In [10]:
# Build a separate percentage table so the raw counts in genre_counting stay intact.
# Each count is divided by the total first and then multiplied by 100.
genre_counting_percentage = {
    genre: n_apps / total_number_of_apps * 100
    for genre, n_apps in genre_counting.items()
}

genre_counting_percentage

{'Social Networking': 2.3204112824788106,
 'Photo & Video': 4.849242740030569,
 'Games': 53.66124774211477,
 'Music': 1.9174656106711132,
 'Reference': 0.8892594136445742,
 'Health & Fitness': 2.501042100875365,
 'Weather': 1.0004168403501459,
 'Utilities': 3.4458802278727245,
 'Travel': 1.1254689453939142,
 'Shopping': 1.6951507572599693,
 'News': 1.0421008753647354,
 'Navigation': 0.6391552035570377,
 'Lifestyle': 2.0008336807002918,
 'Entertainment': 7.433652910935113,
 'Food & Drink': 0.8753647353063776,
 'Sports': 1.5839933305543976,
 'Book': 1.5562039738780047,
 'Finance': 1.445046547172433,
 'Education': 6.294289287203002,
 'Productivity': 2.473252744198972,
 'Business': 0.7919966652771988,
 'Catalogs': 0.1389467833819647,
 'Medical': 0.31957760177851885}

# A lengthy frequency table is difficult to analyze. The lengthier the table, the harder it becomes to see any patterns. As a workaround, we can create well-defined intervals and count the frequency for those intervals instead

In [17]:
# Column 5 of every data row, converted to int (index 0 is skipped as the header row).
n_user_ratings = [int(row[5]) for row in apps_data[1:]]

# The extremes show how wide the value range is before choosing intervals.
ratings_max, ratings_min = max(n_user_ratings), min(n_user_ratings)
print(ratings_max, ratings_min, sep='\n')

2974676
0


In [19]:
# One zeroed counter per interval of user-rating counts.
user_ratings_freq = {
    "0 - 10000": 0,
    "10000 - 100000": 0,
    "100000 - 500000": 0,
    "500000 - 1000000": 0,
    "1000000+": 0,
}

In [22]:
# Reset every bucket to 0 at the top of this cell so the cell is idempotent:
# without the reset, each re-run adds another full pass of counts on top of the
# previous totals and inflates the table.
user_ratings_freq = {
    "0 - 10000": 0,
    "10000 - 100000": 0,
    "100000 - 500000": 0,
    "500000 - 1000000": 0,
    "1000000+": 0,
}

for row in apps_data[1:]:  # index 0 is skipped (header row)
    user_ratings = int(row[5])

    # Branches are tested in ascending order, so each one only needs its upper
    # bound: reaching an elif already means the value exceeded the previous bound.
    if user_ratings <= 10000:
        user_ratings_freq['0 - 10000'] += 1

    elif user_ratings <= 100000:
        user_ratings_freq['10000 - 100000'] += 1

    elif user_ratings <= 500000:
        user_ratings_freq['100000 - 500000'] += 1

    elif user_ratings <= 1000000:
        user_ratings_freq['500000 - 1000000'] += 1

    else:
        # Everything above 1,000,000 lands in the open-ended top bucket.
        user_ratings_freq['1000000+'] += 1


print(user_ratings_freq)

{'0 - 10000': 6181, '10000 - 100000': 798, '100000 - 500000': 196, '500000 - 1000000': 16, '1000000+': 6}


We learned about dictionaries and focused on how to use them to build frequency tables. Frequency tables are common in data science practice