In [6]:
# import packages
import os
import re
import langid

In [7]:
def list_tweet_files(directory='.'):
    """Return the tweet file names found in `directory`, sorted by name.

    A file qualifies when its whole name has the form 2020-MM-DD_<batch>.txt.
    fullmatch is used so that names with extra trailing characters
    (e.g. '2020-04-18_30.txt.bak') are rejected.  The result is sorted because
    os.listdir returns entries in arbitrary order, and the processing order of
    the files decides which copy of a duplicated tweet is kept later on.
    """
    name_pattern = re.compile(r'2020-\d{2}-\d{2}_\d+\.txt')
    return sorted(entry for entry in os.listdir(directory) if name_pattern.fullmatch(entry))


files_names = list_tweet_files()
files_names

['2020-04-18_30.txt',
 '2020-04-18_8.txt',
 '2020-05-07_3830.txt',
 '2020-05-07_3831.txt',
 '2020-05-07_3832.txt',
 '2020-05-07_3833.txt',
 '2020-05-07_3834.txt',
 '2020-05-12_1.txt',
 '2020-05-12_26.txt',
 '2020-06-07_3831.txt',
 '2020-06-07_3832.txt',
 '2020-06-07_3833.txt',
 '2020-06-07_3834.txt']

## Put everything into a dict

In [11]:
# nested store for the extracted information:
# date -> batch no. -> {'ind': [...tweet ids...], 'content': [...tweet texts...]}
output_dict = dict()
# ids of every tweet kept so far, used to drop duplicates
id_set = set()

In [12]:
# function to transform ascii string into unicode character
# (takes the four hex digits of a \uXXXX escape and returns that character)
cp2chr = lambda c: (b'\\u' + c.encode('ascii')).decode('raw_unicode_escape')

# a literal \uXXXX escape: backslash, 'u', then exactly four hex digits
UNICODE_ESCAPE_RE = re.compile(r'\\u([0-9a-fA-F]{4})')
# one raw tweet object, from its opening key up to the first closing brace
TWEET_RE = re.compile(r'({"created_at[\w\W]*?}|{"text[\w\W]*?}|{"id[\w\W]*?})')
TWEET_ID_RE = re.compile(r'"id":"(\d+)"')
TWEET_TEXT_RE = re.compile(r'"text":"([\w\W]*?)"')


def decode_tweet_text(raw_text):
    """Turn the literal \\uXXXX escapes of a tweet into real characters.

    Characters outside the Basic Multilingual Plane (most emoji) are written
    in the input as a pair of \\uXXXX escapes, i.e. a UTF-16 surrogate pair.
    Decoding each escape on its own yields two lone surrogate code points,
    which cannot be encoded as UTF-8.  Round-tripping the text through UTF-16
    with the 'surrogatepass' handler merges every such pair into the single
    character it stands for.  A surrogate that has no partner is replaced by
    U+FFFD instead of aborting the whole run.
    """
    # only genuine \uXXXX escapes are converted; a backslash followed by other
    # word characters is left as it is
    text = UNICODE_ESCAPE_RE.sub(lambda match: cp2chr(match.group(1)), raw_text)
    return text.encode('utf-16', 'surrogatepass').decode('utf-16', 'replace')


def extract_tweets(file_string):
    """Return two parallel lists (ids, decoded texts) for the tweets in file_string.

    Tweet objects that lack either an id or a text field are skipped, so the
    two lists always have the same length.
    """
    ids, contents = [], []
    for raw_tweet in TWEET_RE.findall(file_string):
        id_match = TWEET_ID_RE.search(raw_tweet)
        text_match = TWEET_TEXT_RE.search(raw_tweet)
        if id_match is None or text_match is None:
            continue
        ids.append(id_match.group(1))
        contents.append(decode_tweet_text(text_match.group(1)))
    return ids, contents


# NOTE: id_set and output_dict are filled in place and never cleared here, so
# re-run the cell that initialises them before re-running this one; otherwise
# every tweet is treated as a duplicate.

# for every txt file
for file_name in files_names:
    # extract date from file name
    date = re.findall(r'2020-\d{2}-\d{2}', file_name)[0]
    # extract batch no. from file name
    batch_no = int(re.findall(r'_\d+', file_name)[0][1:])

    # read the file as one big string; the with-block closes the handle at once
    with open(file_name, 'r', encoding="utf-8") as f:
        file_string = f.read()

    # make adjustment to match the sample output format
    file_string = (file_string.replace('&', '&amp;')
                              .replace('\\"', "&quot;")
                              .replace('\\n', '\n')
                              .replace("'", '&apos;'))

    # find all individual tweets together with their ids
    id_list, content_list = extract_tweets(file_string)

    # Language filtering must happen after the escapes are decoded: an undecoded
    # \uXXXX sequence consists of plain ASCII letters and digits, so langid could
    # classify an emoji-only tweet as English.
    # The tweets are walked from last to first so that, when an id occurs more
    # than once inside one file, the last occurrence is the one that is kept.
    kept_ids, kept_contents = [], []
    for tweet_id, content in zip(reversed(id_list), reversed(content_list)):
        # remove non-English tweets
        if langid.classify(content)[0] != 'en':
            continue
        # remove duplicates (also across files processed earlier)
        if tweet_id in id_set:
            continue
        id_set.add(tweet_id)
        kept_ids.append(tweet_id)
        kept_contents.append(content)
    # restore the original file order
    kept_ids.reverse()
    kept_contents.reverse()

    # put the index and content into the output dict:
    # date is a key, batch no. is a key under the date sub-dict
    output_dict.setdefault(date, {})[batch_no] = {'ind': kept_ids, 'content': kept_contents}

In [13]:
# display the whole nested result (date -> batch no. -> 'ind' / 'content' lists) for inspection
output_dict

{'2020-04-18': {30: {'ind': ['1251362171998498816',
    '1251362173705752576',
    '1251362173756100608',
    '1251362173793771520',
    '1251362176696086530',
    '1251362177774018562',
    '1251362178373873664',
    '1251362179070058496',
    '1251362181343576065',
    '1251362181943263234',
    '1251362184556212231',
    '1251362185827307520',
    '1251362185902579713',
    '1251362186154344448',
    '1251362186401849351',
    '1251362186632531968',
    '1251362186657771525',
    '1251362186771005446',
    '1251362186888450049',
    '1251362189933494273',
    '1251362190155747328',
    '1251362190621241344',
    '1251362190629597184',
    '1251362193381175296',
    '1251362193641074688',
    '1251362193708404737',
    '1251362194111057922',
    '1251362194278809600',
    '1251362194538799112',
    '1251362194660323330',
    '1251362195360739328',
    '1251362195423637508',
    '1251362195818020870',
    '1251362199278321666',
    '1251362199509049344',
    '1251362199723008000',
   

## Output to XML

In [14]:
# dates present in output_dict, in ascending order
# (the keys are yyyy-mm-dd strings, so plain string sorting is chronological)
date = sorted(output_dict)
date

['2020-04-18', '2020-05-07', '2020-05-12', '2020-06-07']

In [15]:
# one ascending list of batch numbers per date, in the same order as `date`
batch_no = [sorted(output_dict[day]) for day in date]

batch_no

[[8, 30], [3830, 3831, 3832, 3833, 3834], [1, 26], [3831, 3832, 3833, 3834]]

In [16]:
# write all the output to an xml file
# the with-block guarantees the file is flushed and closed even if a write fails
with open('output.xml', 'w', encoding="utf-8") as f:
    f.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    f.write('<data>\n')

    # date and batch_no are parallel lists: batch_no[k] holds the sorted batches of date[k]
    for day, batches in zip(date, batch_no):
        f.write('<tweets date="' + str(day) + '">\n')
        for batch in batches:
            tweets = output_dict[day][batch]
            # 'ind' and 'content' are parallel lists: one id per tweet text
            for tweet_id, content in zip(tweets['ind'], tweets['content']):
                f.write('<tweet id="' + tweet_id + '">' + content + '</tweet>\n')
        f.write('</tweets>\n')

    f.write('</data>')