<a href="https://colab.research.google.com/github/he-yilan/language-census-data/blob/main/States.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Census data study of linguistic diversity

In [243]:
# Mount Google Drive inside the Colab runtime; the data folders used below
# live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [245]:
from google.colab import files
import pandas as pd
import os
import csv
import re

In [246]:
# Later cells list this directory with os.listdir() and open the state files
# by bare file name, so the working directory must be the states folder.
os.chdir('/content/drive/MyDrive/Ling55AC/assignment2/states') # the path where you store your csv files for all states

## Population of users of a certain language in each state

In [247]:
# Only CSV files count as state data: a bare os.listdir() would also return
# folders and files of other types. 'new_csv.csv' is the scratch copy that
# to_list writes into the working directory, so it is left out as well.
# Sorting makes the order independent of the file system.
state_files = sorted(
    name for name in os.listdir()
    if name.lower().endswith('.csv') and name != 'new_csv.csv'
)

In [248]:
def to_list(this_path):
  """
  Compiles the data from a CSV file into a list of lists.
  Each list is a row from the original CSV.

  The first four rows of the file are dropped (presumably title/header
  lines of the export -- confirm against the data files).
  Side effect: the kept rows are also written to 'new_csv.csv' in the
  current working directory, replacing any earlier copy.

  this_path: path of the CSV file to read.
  Returns the kept rows, each as a list of cell strings.
  """
  skipped_rows = 4

  # newline='' hands line-ending handling to the csv module, which keeps
  # newlines embedded in quoted cells intact.
  with open(this_path, 'r', newline='') as read_file:
    reader = csv.reader(read_file)
    lines = [
        row
        for row_number, row in enumerate(reader, start=1)
        if row_number > skipped_rows
    ]

  # Same for writing: without newline='' some platforms emit an extra
  # blank line after every row.
  with open('new_csv.csv', 'w', newline='') as write_file:
    csv.writer(write_file).writerows(lines)
  return lines

In [249]:
def get_lang_list(lang_name, lang_matrix):
  """
  Extracts the list from the matrix that holds the data about the language we are interested in.

  lang_name: text to look for. It is matched literally, anywhere inside the
             first cell of a row, so characters such as '(' or '+' are safe.
  lang_matrix: list of rows (lists of strings), e.g. the result of to_list.
  Returns the first matching row, or None when no row matches.
  """
  for lang_list in lang_matrix:
    # A blank CSV line is read as an empty list and has no first cell.
    if lang_list and lang_name in lang_list[0]:
      return lang_list
  return None

In [250]:
def get_all_states_speakers(lang_name):
  """
  Walks over every file in state_files and looks up how many lang_name
  speakers that file reports.
  Returns a list of (file name, number of speakers) tuples, one per file,
  in the order of state_files.
  """
  speakers_by_state = []
  for state_file in state_files:
    row = get_lang_list(lang_name, to_list(state_file))
    # A missing row and a '(D)' cell both count as zero speakers
    # ('(D)' is presumably a withheld-data marker -- confirm).
    if row is not None and row[1] != '(D)':
      count = int(row[1])
    else:
      count = 0
    speakers_by_state.append((state_file, count))
  return speakers_by_state

In [251]:
def sorted_states(lang_name):
  """
  Collects the (file name, number of speakers) pairs for lang_name across
  all states and returns them ordered from most to fewest speakers.
  """
  ranked = list(get_all_states_speakers(lang_name))
  # Order by the speaker count, largest first.
  ranked.sort(key=lambda pair: pair[1], reverse=True)
  return ranked

In [252]:
# Per-language lists of (state file, number of speakers), most speakers first.
nepali = sorted_states("Nepali")
navajo = sorted_states("Navajo")
yiddish = sorted_states("Yiddish")

In [253]:
# Must stay in the same order as lang_state_lists: get_top_percentages reads
# both lists by the same index.
lang_names = ["Nepali", "Navajo", "Yiddish"] # names of 3 languages

In [254]:
# Must stay in the same order as lang_names: get_top_percentages reads both
# lists by the same index.
lang_state_lists = [nepali, navajo, yiddish] # number of users of the 3 languages in all states

## Percentage of national population

In [255]:
# "US.csv" is opened by bare file name in the next cell, so the working
# directory has to be the folder that contains it.
os.chdir('/content/drive/MyDrive/Ling55AC/assignment2') # the path where you store your csv files for national data

In [256]:
# Rows of the national table (first four rows of the file dropped by to_list);
# get_national_speakers searches this list.
us_data = to_list("US.csv")

In [257]:
def get_national_speakers(lang_name):
  """
  Gets the total number of people in the US who speak a certain language.

  Looks lang_name up in the module-level us_data rows and parses the second
  cell of the matching row, ignoring thousands separators (",").

  Raises ValueError when us_data has no row for lang_name, or when the cell
  is not a number.
  """
  us_pop = get_lang_list(lang_name, us_data)
  if us_pop is None:
    raise ValueError("No national data found for language: " + str(lang_name))
  # Clean only a copy of the one cell that is needed. Rewriting the row in
  # place would also strip commas from the language label stored in us_data
  # and change what later lookups can match.
  return int(us_pop[1].replace(",", ""))

In [258]:
def get_state_percentage(lang_name, sorted_state_list, index):
  """
  Gets the share of all US speakers of lang_name who reside in the state at
  position index of sorted_state_list.
  The value is a plain ratio (state speakers / national speakers); it is not
  multiplied by 100 here.
  """
  national_total = get_national_speakers(lang_name)
  state_entry = sorted_state_list[index]
  # Each entry is (state file, number of speakers).
  return state_entry[1] / national_total

In [261]:
def get_top_percentages(top_n=3):
  """
  Prints, for every language in lang_names, the percentage of its US users
  who reside in each of its top states, and returns those states.

  top_n: how many leading states to report per language (default 3). When a
         language has fewer entries than top_n, all of them are reported.

  Returns one list per language, shaped [language name, state 1, state 2, ...],
  where the states are the identifiers stored in the sorted state lists.
  """
  result = []
  # lang_names and lang_state_lists are parallel lists, paired by position.
  for lang, lang_state_list in zip(lang_names, lang_state_lists):
    print(lang)
    top_entries = lang_state_list[:top_n]
    top = [lang]
    top.extend(entry[0] for entry in top_entries)
    for rank in range(1, len(top_entries) + 1):
      # get_state_percentage returns a ratio; scale it to a percentage.
      percentage = get_state_percentage(lang, lang_state_list, rank - 1) * 100
      print("{p}% , of users reside in the top {rank} state".format(p = str(percentage), rank = rank))
    result.append(top)
    print()
  return result

## Percentage of a state total population

In [263]:
# Switch back from the national folder: the functions below open state files
# by bare file name.
os.chdir('/content/drive/MyDrive/Ling55AC/assignment2/states') # the path where you store your csv files for all states

In [264]:
# NOTE(review): `tops` is not assigned in any visible cell; presumably
# `tops = get_top_percentages()` was run in a cell that is not shown -- confirm.
# [1:] drops the first element of each entry (the language name, if tops does
# come from get_top_percentages), keeping only the states.
nepali_states = tops[0][1:]
navajo_states = tops[1][1:]
yiddish_states = tops[2][1:]

In [265]:
def get_state_population(state_path):
  """
  Gets the total population of a state.

  The figure is read from the first row of the state file whose first cell
  contains 'Population 5 years and over'; thousands separators (",") in the
  value are ignored.

  state_path: path of the state's CSV file.
  Returns the population as an int, or None when the file has no such row.
  """
  for row in to_list(state_path):
    # A blank CSV line is read as an empty list and has no first cell.
    if row and 'Population 5 years and over' in row[0]:
      return int(row[1].replace(",", ""))
  return None

In [266]:
def get_top_states_populations(path_list):
  """
  Looks up the total population of every state file in path_list.
  Returns the populations as a list, in the same order as path_list.
  """
  return [get_state_population(state_path) for state_path in path_list]

In [272]:
def get_percentage_in_state(path_list, list_tuples):
  """
  Computes, for each state in path_list, what percentage of the state's
  total population uses a certain language, printing one line per state.

  list_tuples holds (state, number of speakers) pairs and is matched to
  path_list by position, so entry i must describe the state path_list[i].
  Returns the percentages as a list, in the order of path_list.
  """
  shares = []
  for position, state_total in enumerate(get_top_states_populations(path_list)):
    speakers = list_tuples[position][1]
    share = speakers / state_total * 100
    shares.append(share)
    print("{p}% of residents in {state} use this language".format(p = str(share), state = path_list[position]))
  return shares

## English level

In [268]:
def get_eng(path_list, lang_name):
  """
  For each state file in path_list, fetches the figure for lang_name users
  who report "speaking English less than 'very well'".
  The figure is the cell at index 3 of the language's row, returned exactly
  as read from the file.
  Returns a list of (path, figure) tuples in the order of path_list.
  """
  return [
      (state_path, get_lang_list(lang_name, to_list(state_path))[3])
      for state_path in path_list
  ]

In [269]:
def _as_count(value):
  """Converts a table cell to an int, counting the '(D)' marker as 0."""
  return 0 if value == '(D)' else int(value)


def get_eng_percentage(speakers, engs):
  """
  Expresses each English-level figure as a percentage of the total number of
  speakers in the same state.

  speakers: (state, number of speakers) pairs.
  engs: (state, figure) pairs, e.g. the result of get_eng.
  The two lists are matched by position; entries of speakers beyond the
  length of engs are ignored.

  A '(D)' cell counts as 0, the same way get_all_states_speakers treats it,
  and a state with 0 speakers yields 0.0 instead of a division error.

  Returns a list of (state, percentage) tuples, labelled with the state from engs.
  """
  percentages = []
  for speaker_entry, eng_entry in zip(speakers, engs):
    num_speakers = _as_count(speaker_entry[1])
    num_eng = _as_count(eng_entry[1])
    percentage = num_eng / num_speakers * 100 if num_speakers else 0.0
    percentages.append((eng_entry[0], percentage))
  return percentages

In [270]:
# For each language: list of (state file, percentage of that state's speakers
# reported as speaking English less than "very well").
# get_eng_percentage matches its two arguments by position, so this relies on
# nepali_states etc. being in the same order as the start of the sorted lists
# (presumably true when they are the top states of those lists -- confirm).
nepali_eng_levels = get_eng_percentage(nepali, get_eng(nepali_states, "Nepali"))
navajo_eng_levels = get_eng_percentage(navajo, get_eng(navajo_states, "Navajo"))
yiddish_eng_levels = get_eng_percentage(yiddish, get_eng(yiddish_states, "Yiddish"))