# Get our environment set up

In [2]:
import pandas as pd
import numpy as np

import chardet

# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): none of the cells visible here draw random numbers — confirm
# whether this seed is still needed.
np.random.seed(0)

# What are encodings?

In [3]:
# A string literal that contains one non-ASCII character (the euro sign).
before = "This is the euro symbol: €"

# Check the data type of the text before any encoding is applied.
type(before)

str

In [4]:
# Turn the text into bytes using UTF-8. errors="replace" would substitute a
# placeholder for any character the codec cannot represent.
after = before.encode(encoding="utf-8", errors="replace")

# Check the data type of the encoded value.
type(after)

bytes

In [5]:
# Display the encoded value; bytes outside the printable ASCII range are shown
# as \x escape sequences.
after

b'This is the euro symbol: \xe2\x82\xac'

In [6]:
# Decode the bytes back into text with UTF-8 and print the result.
print(after.decode("utf-8"))

This is the euro symbol: €


In [7]:
# Decoding with ASCII fails when `after` contains bytes above 127. The failure
# is the point of this cell, so catch it and print the message instead of
# letting the exception stop a "Restart & Run All".
try:
    print(after.decode("ascii"))
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 25: ordinal not in range(128)

In [10]:
# Start again from the original text.
before = "This is the euro symbol: €"

# ASCII has no euro sign; errors="replace" swaps the unrepresentable character
# for "?" instead of raising.
after = before.encode(encoding="ascii", errors="replace")

# Decode the ASCII bytes back into text and print them.
print(after.decode(encoding="ascii"))

This is the euro symbol: ?


In [13]:
# Decode the current contents of `after` as UTF-8 and print the text.
print(after.decode("utf-8"))

This is the euro symbol: ?


### Your turn

In [14]:
# Sample text ending in "$".
before = "This is my symbol: $"
print(before)

# Encode to ASCII bytes; anything ASCII cannot represent would become "?".
after = before.encode(encoding="ascii", errors="replace")
print(after)

# Decode the bytes back into text with the same codec.
print(after.decode(encoding="ascii"))

This is my symbol: $
b'This is my symbol: $'
This is my symbol: $


In [15]:
# Sample text ending in "#".
before = "This is my symbol: #"
print(before)

# Encode to ASCII bytes; anything ASCII cannot represent would become "?".
after = before.encode(encoding="ascii", errors="replace")
print(after)

# Decode the bytes back into text with the same codec.
print(after.decode(encoding="ascii"))

This is my symbol: #
b'This is my symbol: #'
This is my symbol: #


In [16]:
# Sample text ending in two Chinese characters.
before = "This is my symbol: 你好"
print(before)

# Encode to ASCII bytes; each character ASCII cannot represent becomes "?".
after = before.encode(encoding="ascii", errors="replace")
print(after)

# Decode the bytes back into text with the same codec.
print(after.decode(encoding="ascii"))

This is my symbol: 你好
b'This is my symbol: ??'
This is my symbol: ??


In [21]:
# Sample text ending in two Chinese characters.
# NOTE(review): this cell repeats the preceding experiment verbatim — confirm
# whether both copies are needed.
before = "This is my symbol: 你好"
print(before)

# Encode to ASCII bytes; each character ASCII cannot represent becomes "?".
after = before.encode(encoding="ascii", errors="replace")
print(after)

# Decode the bytes back into text with the same codec.
print(after.decode(encoding="ascii"))

This is my symbol: 你好
b'This is my symbol: ??'
This is my symbol: ??


In [23]:
# Sample text ending in a Devanagari word.
before = "This is my symbol: नमस्ते"
print(before)

# Encode to ASCII bytes; each code point ASCII cannot represent becomes "?".
after = before.encode(encoding="ascii", errors="replace")
print(after)

# Decode the bytes back into text with the same codec.
print(after.decode(encoding="ascii"))

This is my symbol: नमस्ते
b'This is my symbol: ??????'
This is my symbol: ??????


In [24]:
# Same Devanagari sample, this time encoded with UTF-8.
before = "This is my symbol: नमस्ते"
print(before)

# UTF-8 can represent every character here, so nothing is replaced.
after = before.encode(encoding="utf-8", errors="replace")
print(after)

# Decoding with the same codec recovers the original text.
print(after.decode(encoding="utf-8"))

This is my symbol: नमस्ते
b'This is my symbol: \xe0\xa4\xa8\xe0\xa4\xae\xe0\xa4\xb8\xe0\xa5\x8d\xe0\xa4\xa4\xe0\xa5\x87'
This is my symbol: नमस्ते


# Reading in files with encoding problems

In [25]:
# Try to load the file without specifying an encoding. The decode failure is
# the point of this cell, so catch it and print the message instead of letting
# the exception stop a "Restart & Run All".
try:
    kickstarter_2016 = pd.read_csv("../input/kickstarter-projects/ks-projects-201612.csv")
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 11: invalid start byte

In [27]:
# Read the first 10,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/kickstarter-projects/ks-projects-201612.csv", mode="rb") as rawdata:
    sample = rawdata.read(10000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [30]:
# Load the file again, this time telling pandas which encoding to decode with.
kickstarter_2016 = pd.read_csv(
    "../input/kickstarter-projects/ks-projects-201612.csv",
    encoding="Windows-1252",
)

# Preview the first few rows.
kickstarter_2016.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


### Your turn

In [33]:
# Try to load the file without specifying an encoding. The decode failure is
# the point of this cell, so catch it and print the message instead of letting
# the exception stop a "Restart & Run All".
try:
    police_killings = pd.read_csv("../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv")
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 2: invalid start byte

In [48]:
# Read the first 28,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
# NOTE(review): the sample size differs from the 10,000 bytes used for the
# Kickstarter file; presumably tuned by hand — confirm.
with open("../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv", mode="rb") as rawdata:
    sample = rawdata.read(28000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [49]:
# Load the file again, this time telling pandas which encoding to decode with.
police_killings = pd.read_csv(
    "../input/fatal-police-shootings-in-the-us/PoliceKillingsUS.csv",
    encoding="Windows-1252",
)

# Preview the first few rows.
police_killings.head()

Unnamed: 0,id,name,date,manner_of_death,armed,age,gender,race,city,state,signs_of_mental_illness,threat_level,flee,body_camera
0,3,Tim Elliot,02/01/15,shot,gun,53.0,M,A,Shelton,WA,True,attack,Not fleeing,False
1,4,Lewis Lee Lembke,02/01/15,shot,gun,47.0,M,W,Aloha,OR,False,attack,Not fleeing,False
2,5,John Paul Quintero,03/01/15,shot and Tasered,unarmed,23.0,M,H,Wichita,KS,False,other,Not fleeing,False
3,8,Matthew Hoffman,04/01/15,shot,toy weapon,32.0,M,W,San Francisco,CA,True,attack,Not fleeing,False
4,9,Michael Rodriguez,04/01/15,shot,nail gun,39.0,M,H,Evans,CO,False,attack,Not fleeing,False


# Saving your files with UTF-8 encoding

In [53]:
# Write the DataFrame out as CSV. UTF-8 is already to_csv's default encoding;
# it is spelled out here because saving as UTF-8 is the purpose of this step.
# NOTE(review): the output name says "201801" while the data was read from the
# "201612" file — confirm the intended file name.
kickstarter_2016.to_csv("../output/ks-projects-201801-utf8.csv", encoding="utf-8")

In [52]:
# Write the DataFrame out as CSV. UTF-8 is already to_csv's default encoding;
# it is spelled out here because saving as UTF-8 is the purpose of this step.
police_killings.to_csv("../output/police-killings-utf8.csv", encoding="utf-8")

# More Practice

### die_ISO-8859-1.txt

In [97]:
# Try to read the file as UTF-8 text.
# - encoding="utf-8" is explicit so the result does not depend on the
#   platform's default text encoding.
# - the `with` block closes the file handle even when decoding fails.
# - the decode error is the expected outcome here, so it is printed instead of
#   being left to stop a "Restart & Run All".
die_ISO_text = None
try:
    with open("../input/more practice/die_ISO-8859-1.txt", "r", encoding="utf-8") as die_ISO:
        die_ISO_text = die_ISO.read()
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

# Shows the text if the read succeeded; displays nothing when it is None.
die_ISO_text

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xfc in position 36: invalid start byte

In [60]:
# Read the first 1,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/more practice/die_ISO-8859-1.txt", mode="rb") as rawdata:
    sample = rawdata.read(1000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


### harpers_ASCII.txt

In [99]:
# Read the file as UTF-8 text.
# - encoding="utf-8" is explicit so the result does not depend on the
#   platform's default text encoding.
# - the `with` block closes the file handle once the read is done.
# The handle keeps the original cell's name `die_ISO` even though this is a
# different file.
with open("../input/more practice/harpers_ASCII.txt", "r", encoding="utf-8") as die_ISO:
    harpers_text = die_ISO.read()

# Display the text that was read.
harpers_text



In [61]:
# Read the first 1,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/more practice/harpers_ASCII.txt", mode="rb") as rawdata:
    sample = rawdata.read(1000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


### olaf_Windows-1251.txt

In [100]:
# Try to read the file as UTF-8 text.
# - encoding="utf-8" is explicit so the result does not depend on the
#   platform's default text encoding.
# - the `with` block closes the file handle even when decoding fails.
# - the decode error is the expected outcome here, so it is printed instead of
#   being left to stop a "Restart & Run All".
# The handle keeps the original cell's name `die_ISO` even though this is a
# different file.
olaf_text = None
try:
    with open("../input/more practice/olaf_Windows-1251.txt", "r", encoding="utf-8") as die_ISO:
        olaf_text = die_ISO.read()
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

# Shows the text if the read succeeded; displays nothing when it is None.
olaf_text

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xce in position 1494: invalid continuation byte

In [69]:
# Read the first 2,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/more practice/olaf_Windows-1251.txt", mode="rb") as rawdata:
    sample = rawdata.read(2000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'windows-1251', 'confidence': 0.9664223608851299, 'language': 'Bulgarian'}


### portugal_ISO-8859-1.txt

In [101]:
# Try to read the file as UTF-8 text.
# - encoding="utf-8" is explicit so the result does not depend on the
#   platform's default text encoding.
# - the `with` block closes the file handle even when decoding fails.
# - the decode error is the expected outcome here, so it is printed instead of
#   being left to stop a "Restart & Run All".
# The handle keeps the original cell's name `die_ISO` even though this is a
# different file.
portugal_text = None
try:
    with open("../input/more practice/portugal_ISO-8859-1.txt", "r", encoding="utf-8") as die_ISO:
        portugal_text = die_ISO.read()
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

# Shows the text if the read succeeded; displays nothing when it is None.
portugal_text

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 95: invalid continuation byte

In [70]:
# Read the first 1,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/more practice/portugal_ISO-8859-1.txt", mode="rb") as rawdata:
    sample = rawdata.read(1000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


### shisei_UTF-8.txt

In [102]:
# Read the file as UTF-8 text.
# - encoding="utf-8" is explicit so the result does not depend on the
#   platform's default text encoding.
# - the `with` block closes the file handle once the read is done.
# The handle keeps the original cell's name `die_ISO` even though this is a
# different file.
with open("../input/more practice/shisei_UTF-8.txt", "r", encoding="utf-8") as die_ISO:
    shisei_text = die_ISO.read()

# Display the text that was read.
shisei_text

'\ufeffThe Project Gutenberg EBook of Shisei, by Junichiro Tanizaki\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.net\n\n\nTitle: Shisei\n\nAuthor: Junichiro Tanizaki\n\nRelease Date: March 13, 2010 [EBook #31617]\n\nLanguage: Japanese\n\nCharacter set encoding: UTF-8\n\n*** START OF THIS PROJECT GUTENBERG EBOOK SHISEI ***\n\n\n\n\nProduced by Kaoru Tanaka\n\n\n\n\nTitle: 刺靑 (Shisei)\nAuthor: 谷崎潤一郞 (Junichiro Tanizaki)\nLanguage: Japanese\nCharacter set encoding: UTF-16\nText preparation by Kaoru Tanaka\n\n-------------------------------------------------------\nNotes on the signs in the text\n\n《...》 shows ruby (short runs of text alongside the base text to indicate pronunciation).\nEg. 其《そ》\n\n｜ marks the start of a string of ruby-attached characters.\nEg. 十三｜年目《ねんめ》\n\n［＃...］ expla

In [103]:
# Read the first 2,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/more practice/shisei_UTF-8.txt", mode="rb") as rawdata:
    sample = rawdata.read(2000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'UTF-8-SIG', 'confidence': 1.0, 'language': ''}


### yan_BIG-5.txt

In [104]:
# Try to read the file as UTF-8 text.
# - encoding="utf-8" is explicit so the result does not depend on the
#   platform's default text encoding.
# - the `with` block closes the file handle even when decoding fails.
# - the decode error is the expected outcome here, so it is printed instead of
#   being left to stop a "Restart & Run All".
# The handle keeps the original cell's name `die_ISO` even though this is a
# different file.
yan_text = None
try:
    with open("../input/more practice/yan_BIG-5.txt", "r", encoding="utf-8") as die_ISO:
        yan_text = die_ISO.read()
except UnicodeDecodeError as err:
    print("{}: {}".format(type(err).__name__, err))

# Shows the text if the read succeeded; displays nothing when it is None.
yan_text

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 1387: invalid continuation byte

In [105]:
# Read the first 2,000 bytes in binary mode so chardet sees the raw,
# undecoded content.
with open("../input/more practice/yan_BIG-5.txt", mode="rb") as rawdata:
    sample = rawdata.read(2000)

# Ask chardet for its best guess at the encoding and show it.
result = chardet.detect(sample)
print(result)

{'encoding': 'Big5', 'confidence': 0.99, 'language': 'Chinese'}
