# JSON Errors Analysis
In this notebook we analyze which files are correctly formatted, comparing across periods and languages.
- Correct files are in JSON format.
- Some files are not valid JSON, so parsing them with json.loads() raises an error.
- Some files contain Twitter errors: json.loads() succeeds, but it returns a bare number such as 420 or 400 instead of a tweet. To detect these, I consider correct only the lines that are loadable and have the entry "created_at".

In [7]:
import pandas as pd
from glob import glob
import gzip
import json

In [44]:
# Collect every entry of the data folder (data is divided by language),
# then order the paths lexicographically.
files = glob("/data/fast/public/collections/multilang/DATA/vaccine/*")
files.sort()

### Correct File

In [37]:
# Peek at the first record of one file: decode its first line, then stop reading.
with gzip.open(files[240], mode="rt") as gz_file:
    for line in gz_file:
        tweet = json.loads(line)
        break
tweet

{'created_at': 'Mon Oct 14 04:10:06 +0000 2019',
 'id': 1183595812208762880,
 'id_str': '1183595812208762880',
 'text': '@GurkaynakGonenc @DrMkoksal Grip aşısı ne yazık ki, küçük bir kaç grup için ücretsiz.',
 'display_text_range': [28, 85],
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'in_reply_to_status_id': 1183499690932948992,
 'in_reply_to_status_id_str': '1183499690932948992',
 'in_reply_to_user_id': 2827010891,
 'in_reply_to_user_id_str': '2827010891',
 'in_reply_to_screen_name': 'GurkaynakGonenc',
 'user': {'id': 1850791254,
  'id_str': '1850791254',
  'name': 'Beyhan Bulgurlu',
  'screen_name': 'beyhangoksan',
  'location': None,
  'url': None,
  'description': None,
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 173,
  'friends_count': 375,
  'listed_count': 2,
  'favourites_count': 6183,
  'statuses_count': 12066,
  'created_at': 'Tue Sep 10 08:29:51 +0000 

In [38]:
# A valid tweet exposes a "created_at" entry; its presence is the criterion
# used in this notebook to count a line as correct.
tweet["created_at"]

'Mon Oct 14 04:10:06 +0000 2019'

### Error with json.loads()

In [42]:
# Try to decode the first line of this file. `line` keeps the raw text even
# when json.loads() raises, so it can be inspected afterwards.
with gzip.open(files[2000], mode="rt") as gz_file:
    for line in gz_file:
        tweet = json.loads(line)
        break
tweet

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [43]:
# Inspect the raw line that json.loads() rejected; per the output it is the
# start of a Python traceback rather than JSON.
line

'Traceback (most recent call last):\n'

### Twitter Error

In [40]:
# Decode the first line of this file. Parsing succeeds, but the decoded value
# is whatever JSON value the line holds, not necessarily a tweet.
with gzip.open(files[2540], mode="rt") as gz_file:
    for line in gz_file:
        tweet = json.loads(line)
        break
tweet

420

In [41]:
# The decoded value here is a bare int (see the previous output), so
# subscripting it raises TypeError instead of returning a date.
tweet["created_at"]

TypeError: 'int' object is not subscriptable

### One File Example
Let us count the number of json lines and errors for one file

In [75]:
# Index (into `files`) of the single file inspected in this section.
k = 2540

# Per-line tallies: lines that are valid tweets, lines that fail JSON parsing,
# and lines that parse but are not tweets (e.g. a bare Twitter status code).
json_files, json_errors, twitter_errors = 0, 0, 0
file = files[k]

# Parse the metadata from the file *name* only, never from the directory part
# of the path: an underscore in any parent folder would otherwise shift the
# pieces. The name is assumed to look like "<YYYYMMDD>_<lang>[-<lang>...].<ext>"
# (inferred from the parsed values shown below -- TODO confirm).
name_parts = file.split("/")[-1].split("_")
day = name_parts[0]
lang = name_parts[1].split(".")[0].split("-")
month = day[:6]  # leading six characters of the day: YYYYMM

In [76]:
# Reset the tallies here so that re-running this cell does not double-count.
json_files, json_errors, twitter_errors = 0, 0, 0

with gzip.open(files[k], "rt") as f:
    for line in f:
        # A line that is not JSON at all counts as a JSON error. Only parsing
        # failures are caught, so Ctrl-C and genuine bugs still surface.
        try:
            tweet = json.loads(line)
        except json.JSONDecodeError:
            json_errors += 1
            continue

        # A line that parses but is not an object with "created_at" (for
        # instance a bare number such as 420) counts as a Twitter error.
        if isinstance(tweet, dict) and "created_at" in tweet:
            json_files += 1
        else:
            twitter_errors += 1


In [78]:
# Label each value collected for the single file and show them as one Series.
summary_labels = ["lang", "day", "month", "json_lines", "json_errors", "twitter_errors"]
summary_values = [lang, day, month, json_files, json_errors, twitter_errors]
pd.Series(summary_values, index=summary_labels)

lang                  [hu]
day               20200717
month               202007
json_lines               5
json_errors              0
twitter_errors         133
dtype: object

### All Files 

In [None]:
# Disabled earlier version of the all-files loop, kept as a string literal so
# it never runs. Unlike the loop in the next cell, it has no try/except around
# the iteration over the file's lines.
"""count_lines = []

for file in files:
    json_files, json_errors, twitter_errors = 0,0,0
    lang = file.split("_")[1].split(".")[0].split("-")
    day = file.split("/")[-1].split("_")[0]
    month = day[:6]
    with gzip.open(file, "rt") as f:
        for line in f:
            try:
                tweet = json.loads(line)
                try:
                    u = tweet["created_at"]
                    json_files += 1
                except:
                    twitter_errors += 1
            except:
                json_errors += 1
    count_lines.append([lang, day, month, json_files, json_errors, twitter_errors])
"""

In [None]:
def _count_records(path):
    """Tally the lines of one gzip-compressed text file by how they parse.

    Parameters
    ----------
    path : str
        Path of the gzip-compressed file; it is read line by line in text mode.

    Returns
    -------
    tuple of int
        ``(json_lines, json_errors, twitter_errors)``:

        * ``json_lines`` -- lines that decode to a JSON object containing
          the key ``"created_at"``;
        * ``json_errors`` -- lines that ``json.loads`` cannot decode, plus one
          extra count if reading the file itself fails part-way through;
        * ``twitter_errors`` -- lines that decode to anything else (for
          example a bare number such as 420).

    Raises
    ------
    OSError
        If the file cannot be opened at all.
    """
    json_lines = json_errors = twitter_errors = 0
    with gzip.open(path, "rt") as f:
        try:
            for line in f:
                try:
                    record = json.loads(line)
                except ValueError:  # json.JSONDecodeError is a ValueError
                    json_errors += 1
                    continue
                if isinstance(record, dict) and "created_at" in record:
                    json_lines += 1
                else:
                    twitter_errors += 1
        except Exception:
            # Reading can fail mid-file (e.g. corrupted or truncated
            # compressed data, undecodable bytes). Count it as a single JSON
            # error and keep the lines tallied so far. `Exception` rather than
            # a bare `except`, so KeyboardInterrupt still stops the run.
            json_errors += 1
    return json_lines, json_errors, twitter_errors


# One row per file: [lang, day, month, json_lines, json_errors, twitter_errors].
count_lines = []

for file in files:
    # Parse the metadata from the file *name* only, so that an underscore in a
    # parent folder cannot shift the pieces. The name is assumed to look like
    # "<YYYYMMDD>_<lang>[-<lang>...].<ext>" -- TODO confirm.
    name_parts = file.split("/")[-1].split("_")
    day = name_parts[0]
    lang = name_parts[1].split(".")[0].split("-")
    month = day[:6]  # leading six characters of the day: YYYYMM

    json_files, json_errors, twitter_errors = _count_records(file)
    count_lines.append([lang, day, month, json_files, json_errors, twitter_errors])


In [113]:
# Turn the per-file rows into a table with one named column per collected value.
count_columns = ["lang", "day", "month", "json_lines", "json_errors", "twitter_errors"]
df = pd.DataFrame(data=count_lines, columns=count_columns)

In [136]:
# Show the per-file counts table (one row per input file).
df

Unnamed: 0,lang,day,month,json_lines,json_errors,twitter_errors
0,"[bg, pl, pt, fr]",20190904,201909,7283,0,0
1,"[bg, fi, el, cs]",20190905,201909,29,0,0
2,"[bg, pl, pt, fr]",20190905,201909,329,0,1
3,[en],20190905,201909,53312,0,0
4,[es],20190905,201909,16773,0,0
...,...,...,...,...,...,...
16316,[ro],20220606,202206,71,0,0
16317,[ru],20220606,202206,415,0,0
16318,[sk],20220606,202206,0,0,0
16319,[sv],20220606,202206,285,0,0


In [None]:
#df.to_csv("/home/jlenti/Files/count_lines_and_errors.csv")