## Indexing Tweets
* introducing custom analyzers in the settings section of an ES index
* prepare settings and mappings for the index
* (bulk) inserting tweets using elasticsearch api

In [41]:
# This command is used to install elasticsearch library.
# Use it once and comment it out again
#!pip install elasticsearch


from elasticsearch import Elasticsearch

In [42]:
# Client for the local Elasticsearch node used throughout this notebook.
# The address is spelled out instead of relying on the client's implicit default,
# so the connection target is visible to the reader and does not depend on the
# installed client version supplying one.
es = Elasticsearch("http://localhost:9200")

# Name of the index that the following cells create and populate.
index_name = "tweets"

#### Custom analyzers in ES
In ES, the text fields of documents can be analyzed before being indexed. This analysis can be performed by the default "standard" analyzer, or developers can define their own custom analyzer in the settings section of the index.
To get a better idea of what an analyzer does with text, try out the following `_analyze` requests in the Kibana Dev Tools console.

__Example-1__
```json
GET /_analyze 
{
  "tokenizer": "whitespace",
  "filter": ["lowercase", "stop"],
  "char_filter": ["html_strip"],
  "text": "text to be analyzed. It contains <html></html>"
}
```
__Example-2__
```json
GET /_analyze 
{
  "tokenizer": "standard",
          "filter": [
            "lowercase",
             {
              "type": "ngram",
              "min_gram": 3,
              "max_gram": 4
            }
          ],
  "char_filter": ["html_strip"],
  "text": "text to be analyzed. It contains <html></html>"
}
```

In [43]:
# Create the index with custom analysis settings and explicit field mappings.


def text_with_keyword_field():
    """Return a fresh mapping for a `text` field that also carries a `keyword` sub-field.

    The sub-field is named "keyword" and ignores values longer than 256 characters.
    A new dict is built on every call so the fields in the mapping do not share state.
    """
    return {
        "type": "text",
        "fields": {
            "keyword": {
                "type": "keyword",
                "ignore_above": 256
            }
        }
    }


# Development convenience only: drop a previously created index so that this cell
# can be re-run from a clean state. The index name is passed as a keyword argument,
# which the client accepts regardless of whether positional arguments are allowed.
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

# Index body: analysis settings followed by the field mappings.
settings = {
    "settings": {
        "analysis": {
            # Token filter producing character n-grams of length 3 to 4.
            "filter": {
                "trigrams_filter": {
                    "type": "ngram",
                    "min_gram": 3,
                    "max_gram": 4
                }
            },
            # Custom analyzer: standard tokenizer, then lowercase, then the n-gram filter.
            # NOTE(review): no field in "mappings" below references "text_processing"
            # through an "analyzer" entry, so as written it is only available on request
            # (e.g. via the _analyze API) - confirm whether "text" is meant to use it.
            "analyzer": {
                "text_processing": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "trigrams_filter"
                    ]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "date": text_with_keyword_field(),
            "flag": text_with_keyword_field(),
            "id": {
                "type": "keyword",
                "ignore_above": 256
            },
            "target": text_with_keyword_field(),
            "text": {
                "type": "text"
            },
            "user": text_with_keyword_field()
        }
    }
}

# Create the index. `ignore=400` is kept from the original call; it asks the client
# not to raise when the server answers with HTTP 400.
es.indices.create(index=index_name, ignore=400, body=settings)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'tweets'}

## Data source:
https://www.kaggle.com/kazanova/sentiment140/data#

In [44]:
# Index one sample record.
#
# Meaning of the fields of a record:
#   target - polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
#   id     - id of the tweet, e.g. 2087
#   date   - date of the tweet, e.g. Sat May 16 23:58:44 UTC 2009
#   flag   - the query, e.g. lyx; NO_QUERY when there is no query
#   user   - the user that tweeted, e.g. robotickilldozr
#   text   - text of the tweet, e.g. Lyx is cool
tweet = dict(
    target="4",
    id="2193602064",
    date="Tue Jun 16 08:40:49 PDT 2009",
    flag="NO_QUERY",
    user="tinydiamondz",
    text="Happy 38th Birthday to my boo of alll time!!! Tupac Amaru Shakur",
)

# The tweet's own id is used as the document id.
res = es.index(index=index_name, id=tweet["id"], body=tweet)
print(res)

# Now check http://localhost:9200/tweets/_mappings

{'_index': 'tweets', '_type': '_doc', '_id': '2193602064', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


In [45]:
from elasticsearch import helpers
import csv
import gzip
import json

# Number of tweets sent to Elasticsearch per bulk request.
BATCH_SIZE = 10000
# Column order of the CSV rows; the same names are used as document field names.
TWEET_FIELDS = ("target", "id", "date", "flag", "user", "text")


def tweet_from_row(row):
    """Turn one parsed CSV row into a tweet dict.

    Args:
        row: list of column values as produced by `csv.reader`.

    Returns:
        A dict keyed by TWEET_FIELDS built from the first six columns,
        or None when the row has fewer columns than TWEET_FIELDS.
    """
    if len(row) < len(TWEET_FIELDS):
        return None
    return dict(zip(TWEET_FIELDS, row))


def flush_batch(client, batch, index):
    """Bulk-insert `batch` into `index` and return the number of documents indexed.

    An empty batch is a no-op. Any exception raised by `helpers.bulk` is reported
    and the batch is dropped, so a single failing batch does not stop the load.
    The batch is deliberately not retried: the documents carry no explicit `_id`,
    so sending them again could index a second copy of those that already succeeded.
    """
    if not batch:
        return 0
    try:
        indexed, _ = helpers.bulk(client, batch, index=index)
    except Exception as exc:  # best-effort load, but never silent
        print("bulk insert of", len(batch), "tweets failed:", repr(exc))
        return 0
    return indexed


actions = []
indexed_total = 0
skipped_rows = 0

# newline='' lets the csv module handle line endings itself; csv.reader also copes
# with quoted fields, so commas inside a tweet's text no longer truncate it.
with gzip.open('training.1600000.processed.noemoticon.csv.gz', 'rt',
               encoding='latin-1', newline='') as f:
    for row in csv.reader(f):
        tweet = tweet_from_row(row)
        if tweet is None:
            skipped_rows += 1
            continue
        actions.append(tweet)
        if len(actions) >= BATCH_SIZE:
            indexed_total += flush_batch(es, actions, index_name)
            actions = []

# Send whatever is left over when the row count is not a multiple of BATCH_SIZE.
indexed_total += flush_batch(es, actions, index_name)
actions = []

print("indexed:", indexed_total, "| skipped malformed rows:", skipped_rows)
        
        
        
# Practice:
    #1) download the twitter file and compress it using the gzip command
    #2) read the tweets one by one using the above code, then complete it to perform the following:
        # a) create a json tweet and add it to the actions list
        # b) bulk insert every 10000 tweets into the "tweets" index
    #3) [optional] repeat all the above steps using 'scala'

