<a href="https://colab.research.google.com/github/he-yilan/language-speaker-populations/blob/main/States.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Census data study of linguistic diversity

In [40]:
# Mount Google Drive in the Colab runtime at /content/drive; the CSV folders used below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
from google.colab import files
import pandas as pd
import os
import csv
import re

In [42]:
# Work inside the folder of per-state CSV files; the relative file names used below resolve against it.
os.chdir('/content/drive/MyDrive/Ling55AC/assignment2/states') # the path where you store your csv files for all states

## Population of speakers of a certain language in each state

In [43]:
# Keep only the per-state CSV files, in a deterministic (alphabetical) order.
# A scratch copy named new_csv.csv may be present in this folder from earlier
# runs of to_list(); it is not a state and must not be counted as one.
state_files = sorted(
    name for name in os.listdir()
    if name.lower().endswith('.csv') and name != 'new_csv.csv'
)

In [44]:
def to_list(this_path, skip_rows=(1, 2, 3, 4)):
  """
  Reads a CSV file into a list of lists, one inner list per row.

  Args:
    this_path: path of the CSV file to read.
    skip_rows: 1-based row numbers to leave out. Defaults to the first
      four rows, which is what this function has always discarded.

  Returns:
    The remaining rows, in file order, each as a list of strings.

  No copy of the rows is written to disk. The earlier 'new_csv.csv' dump was
  never read back, and it landed in the working directory, which is the same
  folder that os.listdir() scans for state files.
  """
  skipped = set(skip_rows)
  # newline='' is the csv module's documented way to open files, so that
  # line breaks inside quoted fields are parsed correctly.
  with open(this_path, 'r', newline='') as read_file:
    reader = csv.reader(read_file)
    return [row for row_number, row in enumerate(reader, start=1)
            if row_number not in skipped]

In [45]:
def get_lang_list(lang_name, lang_matrix):
  """
  Finds the row that holds the data about the language we are interested in.

  Args:
    lang_name: text to look for in the first cell of each row. It is used as
      a regular expression, so a plain name matches anywhere in the cell.
    lang_matrix: list of rows, each a list of strings.

  Returns:
    The first row whose first cell matches, or None if no row matches.
  """
  pattern = re.compile(".*" + lang_name)  # compiled once instead of once per row
  for lang_list in lang_matrix:
    # Blank CSV lines come back as empty lists; skip them rather than
    # raising IndexError on lang_list[0].
    if lang_list and pattern.search(lang_list[0]) is not None:
      return lang_list
  return None

In [46]:
def get_all_states_speakers(lang_name):
  """
  Iterates through all the state files and finds the population of lang_name speakers in each.

  A state counts as 0 speakers when its file has no row for the language or
  when the count cell is '(D)'.

  Returns a list of (file name, number of speakers) tuples, in the order of state_files.
  """
  result = []
  for file in state_files:
    lang_list = get_lang_list(lang_name, to_list(file))
    if lang_list is None or lang_list[1] == '(D)':
      num_speakers = 0
    else:
      # Drop thousands separators, as get_national_speakers does, so that
      # a count written as "1,234" parses instead of raising ValueError.
      num_speakers = int(lang_list[1].replace(",", ""))
    result.append((file, num_speakers))
  return result

In [47]:
def sorted_states(lang_name):
  """
  Ranks the states by their number of lang_name speakers.

  Returns the (state file, speaker count) tuples produced by
  get_all_states_speakers, ordered from the largest count to the smallest.
  """
  def speaker_count(entry):
    return entry[1]

  ranking = list(get_all_states_speakers(lang_name))
  ranking.sort(key=speaker_count, reverse=True)
  return ranking

In [48]:
# One ranked (state file, speaker count) list per language of interest.
nepali, navajo, yiddish = (
    sorted_states(language) for language in ("Nepali", "Navajo", "Yiddish")
)

In [98]:
# Show the (state file, speaker count) ranking for Nepali, largest count first.
nepali

[('Texas.csv', 11735),
 ('New York.csv', 9015),
 ('California.csv', 7165),
 ('Virginia.csv', 6305),
 ('Massachusetts.csv', 5415),
 ('Pennsylvania.csv', 4720),
 ('Maryland.csv', 4240),
 ('Colorado.csv', 3615),
 ('Georgia.csv', 3345),
 ('Illinois.csv', 3135),
 ('Michigan.csv', 2405),
 ('Washington.csv', 2385),
 ('North Carolina.csv', 2260),
 ('Ohio.csv', 2130),
 ('New Hampshire.csv', 1990),
 ('Utah.csv', 1985),
 ('Kentucky.csv', 1855),
 ('Minnesota.csv', 1675),
 ('Florida.csv', 1525),
 ('Connecticut.csv', 1325),
 ('Missouri.csv', 1290),
 ('Tennessee.csv', 1185),
 ('Nebraska.csv', 1110),
 ('Arizona.csv', 1085),
 ('Oregon.csv', 1045),
 ('Nevada.csv', 1030),
 ('Idaho.csv', 890),
 ('New Jersey.csv', 805),
 ('Wisconsin.csv', 795),
 ('Vermont.csv', 790),
 ('Iowa.csv', 720),
 ('Kansas.csv', 685),
 ('Rhode Island.csv', 575),
 ('Alabama.csv', 555),
 ('South Dakota.csv', 480),
 ('Louisiana.csv', 340),
 ('Arkansas.csv', 320),
 ('West Virginia.csv', 305),
 ('Alaska.csv', 300),
 ('South Carolina.csv'

In [69]:
# Language labels, in the same order as lang_state_lists.
lang_names = ["Nepali", "Navajo", "Yiddish"]

In [70]:
# Ranked state lists, parallel to lang_names: entry i belongs to lang_names[i].
lang_state_lists = [nepali, navajo, yiddish]

## Percentage of national population

In [55]:
# Switch to the parent folder, where the national file US.csv is read from next.
os.chdir('/content/drive/MyDrive/Ling55AC/assignment2') # the path where you store your csv files for national data

In [60]:
# Rows of the national table (US.csv) as returned by to_list, which leaves out the first four lines.
us_data = to_list("US.csv")

In [118]:
def get_national_speakers(lang_name):
  """
  Gets the total number of people in the US who speak a certain language.

  Looks up the lang_name row in us_data and returns its second cell as an
  int, ignoring thousands separators (e.g. "1,234" -> 1234).

  Raises:
    ValueError: if us_data has no row for lang_name.
  """
  us_pop = get_lang_list(lang_name, us_data)
  if us_pop is None:
    raise ValueError("no national row found for language: " + lang_name)
  # Clean only the cell that is needed, without writing back into the row:
  # the row is shared with us_data, so editing it in place would change the
  # table for every later reader.
  return int(us_pop[1].replace(",", ""))

In [119]:
def get_state_percentage(lang_name, sorted_state_list, index):
  """
  Computes the share of the language's US speakers who reside in one state.

  sorted_state_list holds (state, speaker count) tuples and index selects the
  state. The result is a fraction (state speakers / national speakers); it is
  not multiplied by 100 here.
  """
  national_total = get_national_speakers(lang_name)
  state_entry = sorted_state_list[index]
  return state_entry[1] / national_total

In [123]:
def get_top_percentages(top_n=3):
  """
  Prints, for each language, the percentage of its US speakers residing in each of its top states.

  Languages come from lang_names and their ranked state lists from
  lang_state_lists; the two lists are walked in parallel, however many
  entries they hold.

  Args:
    top_n: how many leading states to report per language (default 3). A
      language with fewer ranked states reports only those it has.

  Returns:
    One list per language: [language name, top state, second state, ...].
  """
  result = []
  for lang, lang_state_list in zip(lang_names, lang_state_lists):
    print(lang)
    leaders = lang_state_list[:top_n]
    result.append([lang] + [entry[0] for entry in leaders])
    for j in range(len(leaders)):
      percentage = get_state_percentage(lang, lang_state_list, j)
      percentage = percentage * 100
      print("{p}% , of speakers reside in the top {rank} state".format(p = str(percentage), rank = j+1))
    print()
  return result

In [124]:
# Prints the per-state percentages and keeps the returned [language, state, ...] rows for later cells.
tops = get_top_percentages()

Nepali
12.454892804075568% , of speakers reside in the top 1 state
9.568032264911908% , of speakers reside in the top 2 state
7.604542559966037% , of speakers reside in the top 3 state

Navajo
50.227182813230556% , of speakers reside in the top 1 state
38.38730173953701% , of speakers reside in the top 2 state
4.911104983635645% , of speakers reside in the top 3 state

Yiddish
82.76021647748453% , of speakers reside in the top 1 state
4.2337802573562495% , of speakers reside in the top 2 state
3.718296461030196% , of speakers reside in the top 3 state



In [126]:
# Show the language name followed by its top state files, one row per language.
tops

[['Nepali', 'Texas.csv', 'New York.csv', 'California.csv'],
 ['Navajo', 'Arizona.csv', 'New Mexico.csv', 'Utah.csv'],
 ['Yiddish', 'New York.csv', 'New Jersey.csv', 'Florida.csv']]

## Percentage of a state's total population

In [133]:
# Return to the per-state folder: the functions below open state files by their bare file names.
os.chdir('/content/drive/MyDrive/Ling55AC/assignment2/states') # the path where you store your csv files for all states

In [177]:
# Drop the leading language name from each row of tops, keeping only the state file names.
nepali_states, navajo_states, yiddish_states = (
    tops[row][1:] for row in range(3)
)

In [178]:
def get_state_population(state_path):
  """
  Gets the total population (5 years and over) of a state.

  Reads the state's CSV with to_list and returns, as an int, the second cell
  of the first row whose first cell contains 'Population 5 years and over'.
  Returns None when no such row exists.
  """
  for row in to_list(state_path):  # named `row` so the builtin `list` is not shadowed
    # Empty rows (blank CSV lines) have no first cell to inspect.
    if row and re.search(".*" + 'Population 5 years and over', row[0]) is not None:
      # Thousands separators would make int() fail, so strip them first.
      return int(row[1].replace(",", ""))
  return None

In [179]:
def get_top_states_populations(path_list):
  """
  Looks up the total population of every state file in path_list.

  Returns a list with one get_state_population result per path, in the same order.
  """
  return [get_state_population(state_path) for state_path in path_list]

In [217]:
def get_percentage_in_state(path_list, list_tuples):
  """
  Computes what percentage of each state's population uses the language.

  Args:
    path_list: state CSV file names.
    list_tuples: (state file, speaker count) tuples for one language.

  Each state's speaker count is looked up by file name, so the result does
  not depend on list_tuples being ordered like path_list. When a name is
  absent from list_tuples, the entry at the same position is used instead.

  Prints one line per state and returns the percentages in path_list order.
  """
  # First occurrence wins, which matches reading the ranked list from the top.
  speakers_by_state = {}
  for state, count in list_tuples:
    speakers_by_state.setdefault(state, count)

  percentages = []
  total_pops = get_top_states_populations(path_list)
  for i in range(len(total_pops)):
    state = path_list[i]
    total_state = total_pops[i]
    if state in speakers_by_state:
      speaker_pop = speakers_by_state[state]
    else:
      speaker_pop = list_tuples[i][1]
    percentage = speaker_pop / total_state
    percentage *= 100
    percentages.append(percentage)
    print("{p}% of residents in {state} use this language".format(p = str(percentage), state = path_list[i]))
  return percentages

In [218]:
# Nepali speakers as a percentage of each top state's population.
get_percentage_in_state(nepali_states, nepali)

0.049505577023674925% of residents in Texas.csv use this language
0.04920186551206288% of residents in New York.csv use this language
0.02039484360286056% of residents in California.csv use this language


[0.049505577023674925, 0.04920186551206288, 0.02039484360286056]

In [219]:
# Navajo speakers as a percentage of each top state's population.
get_percentage_in_state(navajo_states, navajo)

1.3888605730972576% of residents in Arizona.csv use this language
3.321278744179075% of residents in New Mexico.csv use this language
0.3207493468935702% of residents in Utah.csv use this language


[1.3888605730972576, 3.321278744179075, 0.3207493468935702]

In [220]:
# Yiddish speakers as a percentage of each top state's population.
get_percentage_in_state(yiddish_states, yiddish)

0.7027434501756203% of residents in New York.csv use this language
0.07941407913847479% of residents in New Jersey.csv use this language
0.03211239259695278% of residents in Florida.csv use this language


[0.7027434501756203, 0.07941407913847479, 0.03211239259695278]

## English level

In [235]:
def get_eng(path_list, lang_name):
  """
  Collects, for every state file in path_list, the figure for language users
  who report "speaking English less than 'very well'".

  Returns a list of (state file, figure) tuples in path_list order; the
  figure is the fourth cell of the language's row, exactly as read from the CSV.
  """
  def limited_english(state_path):
    language_row = get_lang_list(lang_name, to_list(state_path))
    return language_row[3]

  return [(state_path, limited_english(state_path)) for state_path in path_list]

In [236]:
def get_eng_percentage(speakers, engs):
  """
  Expresses each state's English figure as a percentage of that state's speakers.

  Args:
    speakers: (state file, speaker count) tuples for one language.
    engs: (state file, figure) tuples, e.g. as returned by get_eng.

  Each state in engs is matched to its speaker count by file name, so the
  result is right even when engs is not the leading part of speakers. When a
  name is absent from speakers, the entry at the same position is used.

  Returns:
    A list of (state file, percentage) tuples in engs order.
  """
  def as_int(value):
    # Figures read from CSV are strings and may carry thousands separators.
    if isinstance(value, str):
      return int(value.replace(",", ""))
    return int(value)

  # First occurrence wins, which matches reading the ranked list from the top.
  speakers_by_state = {}
  for state, count in speakers:
    speakers_by_state.setdefault(state, count)

  percentages = []
  for i in range(len(engs)):
    state = engs[i][0]
    if state in speakers_by_state:
      num_speakers = as_int(speakers_by_state[state])
    else:
      num_speakers = as_int(speakers[i][1])
    num_eng = as_int(engs[i][1])
    percentage = num_eng / num_speakers
    percentage *= 100
    percentages.append((state, percentage))
  return percentages

In [237]:
# Percentage of Nepali speakers in each top state who report speaking English less than "very well".
nepali_eng_levels = get_eng_percentage(nepali, get_eng(nepali_states, "Nepali"))
nepali_eng_levels

[('Texas.csv', 55.30464422667235),
 ('New York.csv', 63.78258458125347),
 ('California.csv', 54.78018143754362)]

In [238]:
# Percentage of Navajo speakers in each top state who report speaking English less than "very well".
navajo_eng_levels = get_eng_percentage(navajo, get_eng(navajo_states, "Navajo"))
navajo_eng_levels

[('Arizona.csv', 25.038189803322513),
 ('New Mexico.csv', 18.080886945658964),
 ('Utah.csv', 15.720737214695472)]

In [239]:
# Percentage of Yiddish speakers in each top state who report speaking English less than "very well".
yiddish_eng_levels = get_eng_percentage(yiddish, get_eng(yiddish_states, "Yiddish"))
yiddish_eng_levels

[('New York.csv', 38.74417520969245),
 ('New Jersey.csv', 7.4692576286625165),
 ('Florida.csv', 14.312878133102853)]