In [1]:
# Importing the required libraries.
import math, statistics
from nltk.corpus import stopwords
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import os
from IPython.display import clear_output
import pickle
import zlib

In [2]:
# Build the dictionary that maps the 2011 district_ids to their corresponding
# 2001 district_ids from district_mapping.csv (one "id,id" pair per line, no
# header row).
#
# NOTE(review): the previous code dropped the first three characters of the
# file, which presumably removed a UTF-8 byte-order mark that a single-byte
# locale codec decodes as three characters. 'utf-8-sig' removes the BOM (when
# present) independently of the platform's default encoding - confirm the file
# really is UTF-8.
mapping = {}
with open('district_mapping.csv', encoding='utf-8-sig') as mapping_file:
    for row in mapping_file:
        # Skip blank lines (e.g. the one produced by a trailing newline)
        # instead of assuming the file always ends with exactly one newline.
        if not row.strip():
            continue
        fields = row.rstrip('\n').split(',')
        mapping[int(fields[0])] = int(fields[1])

# Build the dictionary that maps a district_id to its label from
# labels_2011.csv (one "id,label" pair per line, no header row).
#
# NOTE(review): the previous code dropped the first three characters of the
# file, which presumably removed a UTF-8 byte-order mark that a single-byte
# locale codec decodes as three characters. 'utf-8-sig' removes the BOM (when
# present) independently of the platform's default encoding - confirm the file
# really is UTF-8.
labels = {}
with open('labels_2011.csv', encoding='utf-8-sig') as labels_file:
    for row in labels_file:
        # Skip blank lines (e.g. the one produced by a trailing newline)
        # instead of assuming the file always ends with exactly one newline.
        if not row.strip():
            continue
        fields = row.rstrip('\n').split(',')
        # The label is kept as the raw string found in the second column.
        labels[int(fields[0])] = fields[1]

# Load the 2019 change predictions of districts and turn them into a pace of
# growth per district.
#
# Every district_id in the file is first translated through `mapping`; several
# ids can translate to the same mapped id, so their predictions are collected
# in a list and averaged afterwards. The resulting dictionary is therefore
# keyed by the *mapped* district_id, not by the id written in the file.
predictions_by_district = {}
with open('change_predictions_2019.csv') as pace_file:
    next(pace_file, None)  # the first line is a header row
    for row in pace_file:
        # Skip blank lines (e.g. the one produced by a trailing newline)
        # instead of assuming the file always ends with exactly one newline.
        if not row.strip():
            continue
        fields = row.rstrip('\n').split(',')
        district_id = mapping[int(fields[0])]
        predictions_by_district.setdefault(district_id, []).append(int(fields[1]))

# Forming a dictionary to map the mapped district_ids to their predicted pace
# of growth: the mean prediction is rounded up and bucketed.
pace = {}
for district_id, predictions in predictions_by_district.items():
    level = math.ceil(statistics.mean(predictions))
    if level < 2:
        pace[district_id] = 'Slow'
    elif level == 2:
        pace[district_id] = 'Average'
    else:
        pace[district_id] = 'Fast'

# Load the industry type of each district from industry_type.csv and form a
# dictionary that maps a district_id to its industrial type ('Type-<value>').
industry = {}
with open('industry_type.csv') as industry_file:
    next(industry_file, None)  # the first line is a header row
    for row in industry_file:
        # Skip blank lines (e.g. the one produced by a trailing newline)
        # instead of assuming the file always ends with exactly one newline.
        if not row.strip():
            continue
        fields = row.rstrip('\n').split(',')
        industry[int(fields[0])] = 'Type-' + fields[1]

In [3]:
# collecting all the entities, for entities blinding.
# `locations` ends up holding every whitespace-separated word found in the
# second column of locations.csv.
locations = set()
with open('locations.csv') as locations_file:
    next(locations_file, None)  # the first line is a header row
    for row in locations_file:
        # A trailing newline used to yield an empty last row and an
        # IndexError on the column lookup below; blank lines are skipped.
        if not row.strip():
            continue
        locations.update(row.rstrip('\n').split(',')[1].split())
    
# Contraction rewrites applied by filter_text, as (regex, replacement) pairs.
# Order matters: the specific forms ("won't", "can't") have to run before the
# generic "n't" rule, and "n't" before the bare "'t" rule.
_CONTRACTION_RULES = (
    (r"won't", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


# function to remove stopwords, perform stemming and entity blinding.
def filter_text(text):
    """Normalise a piece of text into a list of stemmed tokens.

    Steps, in order:
      1. lower-case the text and expand English contractions;
      2. drop every whitespace-delimited chunk that contains a digit;
      3. replace every run of non-alphanumeric characters with one space;
      4. tokenize with ``word_tokenize``;
      5. remove English stop words and the words in the module-level
         ``locations`` set (entity blinding);
      6. stem the remaining tokens with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        The stemmed tokens that survived filtering, in their original order.
    """
    # Lower-case *before* expanding contractions: the patterns are lower-case,
    # so "Won't" used to miss the "won't" rule and ended up as "wo not".
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)

    # Raw string: "\S" and "\d" are not valid escapes in a normal literal.
    text = re.sub(r"\S*\d\S*", "", text)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

    # Build the exclusion set once per call. The original expression
    # `w in stop_words|locations` rebuilt the union for every single token.
    # Tokens are lower-case at this point, so location names are compared
    # case-insensitively; a capitalised name could otherwise never match.
    stop_words = set(stopwords.words('english'))
    excluded = stop_words | {name.lower() for name in locations}

    ps = PorterStemmer()
    tokens = [w for w in word_tokenize(text) if w not in excluded]
    return [ps.stem(w) for w in tokens]

In [4]:
# function to create the required data.
def create_dataset(collection_name):
    """Build and save the district-level and state-level datasets of a collection.

    Reads the zlib-compressed pickles ``file1`` and ``file2`` from
    ``./Collections/<Collection_name> Data/``:

    * every ``file2`` entry holds a record id followed by three fields; the
      first two fields are joined with a space and passed to ``filter_text``;
    * every ``file1`` entry starts with a record id and a numeric id.

    Records whose id is absent from ``file2`` are skipped. For the others:

    * numeric id < 900: the id is translated through ``mapping`` and the row
      ``[record id, field 1, field 2, tokens, field 3, mapped id, label,
      pace, industry]`` is added to the district dataset;
    * numeric id >= 900: the row ``[record id, field 1, field 2, tokens,
      field 3]`` is added to the states dataset.

    Both datasets are written to ``./DT2V_Datasets/`` (``dataset_<name>`` and
    ``states_dataset_<name>``) as a pickled, zlib-compressed pickle, i.e. they
    are read back with ``pickle.loads(zlib.decompress(pickle.load(f)))``.

    Parameters
    ----------
    collection_name : str
        Name of the collection, e.g. ``'agriculture'``.

    Raises
    ------
    KeyError
        If a district id is missing from ``mapping``, ``labels``, ``pace`` or
        ``industry``.
    """
    display_name = collection_name.capitalize()
    print(display_name)
    path = './Collections/' + display_name + ' Data/'

    # SECURITY NOTE: unpickling can execute arbitrary code; only load
    # collection files that come from a trusted source.
    with open(path + 'file1', 'rb') as file1, open(path + 'file2', 'rb') as file2:
        data = pickle.loads(zlib.decompress(pickle.load(file1)))
        text = pickle.loads(zlib.decompress(pickle.load(file2)))

    # Index the text records by their id for constant-time lookups.
    text = {entry[0]: [entry[1], entry[2], entry[3]] for entry in text}

    district_data = []
    states_data = []
    total = len(data)

    for processed, record in enumerate(data, start=1):
        record_id = record[0]
        numeric_id = int(record[1])

        if record_id in text:
            first_field, second_field, third_field = text[record_id]
            if numeric_id < 900:
                # Translate the id exactly once. The stored id used to be
                # passed through `mapping` a second time, so it no longer
                # matched the key used for the label/pace/industry columns.
                district_id = mapping[numeric_id]
                district_data.append([
                    record_id, first_field, second_field,
                    filter_text(first_field + ' ' + second_field),
                    third_field, district_id, labels[district_id],
                    pace[district_id], industry[district_id],
                ])
            else:
                states_data.append([
                    record_id, first_field, second_field,
                    filter_text(first_field + ' ' + second_field),
                    third_field,
                ])

        # Live progress counter: print, then clear on the next output.
        print(display_name, processed, '/', total)
        clear_output(wait=True)

    print('Saving Datasets.')

    outputs = (
        ('./DT2V_Datasets/dataset_' + collection_name, district_data),
        ('./DT2V_Datasets/states_dataset_' + collection_name, states_data),
    )
    for target, rows in outputs:
        # HIGHEST_PROTOCOL belongs to pickle.dumps; it used to be passed as
        # the second argument of zlib.compress, where it is read as the
        # compression level. The on-disk layout is unchanged for readers.
        payload = zlib.compress(pickle.dumps(rows, pickle.HIGHEST_PROTOCOL))
        with open(target, 'wb') as out_file:
            pickle.dump(payload, out_file, pickle.HIGHEST_PROTOCOL)

In [5]:
# Make sure the directory the datasets are written to exists (safe to re-run).
os.makedirs('./DT2V_Datasets/', exist_ok=True)

# Each collection is processed once.
# NOTE(review): 'environment' used to be listed twice, which only rebuilt and
# overwrote the same two files; if a fifth collection was intended instead,
# add its name here.
for collection in ['agriculture', 'development', 'environment', 'industrialization']:
    create_dataset(collection)
print('Done.')

Saving Datasets.
Done.
