In [2]:
import os
from pymongo import MongoClient
import pymongo
from pymongo.server_api import ServerApi
from urllib.parse import quote_plus
import json

In [3]:
## Preprocess and upload data into MongoDB Server
with open('./data/corona-out-2') as f:
    all_lines_list = f.readlines()
filtered_json_list = [json.loads(line) for line in all_lines_list if line.strip()]

In [4]:
filtered_json_list[0]

{'created_at': 'Sun Apr 12 18:27:25 +0000 2020',
 'id': 1249403767180668930,
 'id_str': '1249403767180668930',
 'text': 'RT @nuffsaidny: wishing death on people is weirdo behavior.',
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'truncated': False,
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 1242817830946508801,
  'id_str': '1242817830946508801',
  'name': 'juwelz v',
  'screen_name': 'juwelz_v',
  'location': 'Lower East Side, Manhattan',
  'url': 'https://linktr.ee/juwelzv',
  'description': 'Event Lyfe LLC .. Brand Ambassador: #visionarysociety Music Manager: @ssizzzlee & @tmoneybeatsbang Production🎥📸 #BringNyCultureBack 🌊🌊🌊 !!',
  'translator_type': 'none',
  'protected': False,
  'verified': False,
  'followers_count': 43,
  'friends_count': 118,
  'listed_count': 0,
  'favourites_count': 722,


In [5]:
client = pymongo.MongoClient("mongodb://localhost:27017/")

In [6]:
## Push raw data into Mongodb

db = client['twitter']
collection = db['RawData']
collection.insert_many(filtered_json_list)
print("Data inserted successfully.")

Data inserted successfully.


In [7]:
## Create Tweet Collection

source_db = client['twitter']
source_collection = source_db['RawData']

target_db = client['twitter']
target_collection = target_db['TweetsData']

# aggregation pipeline 
pipeline = [
    {
        '$project': {
            '_id': 1, 
            'created_at':1, 'id':1, 'text':1, 'source':1, 'truncated':1, 'in_reply_to_status_id':1, 
            'in_reply_to_user_id':1, 'in_reply_to_screen_name':1, 
            # user related info
            'user_id':'$user.id', 'display_name':'$user.name', 'profile_name':'$user.screen_name', 
            'user_is_protected':'$user.protected', 'user_is_verified':'$user.verified', 
            'user_follower_count':'$user.followers_count', 'user_friends_count':'$user.friends_count',
            'user_listed_count':'$user.listed_count', "user_created_at": "$user.created_at",
            # Tweet related info
            'geo':1, "coordinates": 1, "place": 1, "contributors": 1, "is_quote_status": 1,
            "quote_count": 1, "reply_count": 1, "retweet_count": 1, "favorite_count": 1,
            "favorited": 1, "retweeted": 1, "possibly_sensitive": 1, "filter_level": 1, "lang": 1,
            "timestamp_ms": 1,
            # Hashtag related info
            'hashtags':"$entities.hashtags", 'urls':"$entities.urls", "user_mentions":"$entities.user_mentions",
            'symbols':"$entities.symbols", 
        }
    }
]

# Perform the aggregation on the source collection
projected_documents = list(source_collection.aggregate(pipeline))

# If the projected documents are not empty, insert them into the new collection
if projected_documents:
    target_collection.insert_many(projected_documents)
    print(f"{len(projected_documents)} documents inserted into {target_db.name}.{target_collection.name}.")
else:
    print("No documents were projected or found.")


18518 documents inserted into twitter.TweetsData.


In [8]:
# Create Userid-Tweet._id mapping collection
source_db = client['twitter']
source_collection = source_db['TweetsData']

target_db = client['twitter']  # This can be the same as source_db
target_collection = target_db['Userid-Tweets._id']

# Define the aggregation pipeline
pipeline = [
    {
        '$group': {
            '_id': '$user_id',  # Group by userid
            'ids': {'$push': '$_id'}  # Accumulate all _id values into a list named 'ids'
        }
    },
    {
        '$project': {
            'user_id': '$_id',  # Rename _id to userid for clarity
            '_id': 0,  # Suppress the default _id field
            'ids': 1  # Include the accumulated list of ids
        }
    }
]

aggregated_documents = list(source_collection.aggregate(pipeline))
if aggregated_documents:
    transformed_documents = [{'user_id': doc['user_id'], 'ids': doc['ids']} for doc in aggregated_documents]
    target_collection.insert_many(transformed_documents)
    print(f"{len(aggregated_documents)} documents inserted into '{target_collection.full_name}'.")
else:
    print("No documents were aggregated or found.")


15505 documents inserted into 'twitter.Userid-Tweets._id'.


In [9]:
## Create Secondary index for user_id on 'Userid-Tweets._id'

target_db = client['twitter']
target_collection = target_db['Userid-Tweets._id']

index_name = target_collection.create_index([('user_id', 1)])

print(f"Index created: {index_name}")


Index created: user_id_1


In [10]:
# Create Hasthags-Tweet._id mapping collection

source_db = client['twitter']
source_collection = source_db['TweetsData']

target_db = client['twitter']  # This can be the same as source_db
target_collection = target_db['Hashtags-Tweets._id']

pipeline = [
    {'$unwind': '$hashtags'},
    {'$group': {
        '_id': '$hashtags', 
        'ids': {'$push': '$_id'}
    }
    },
    {
        '$project': {
            'hashtag': '$_id',  # Rename _id to userid for clarity
            '_id': 0,  # Suppress the default _id field
            'ids': 1  # Include the accumulated list of ids
        }
    }
    
    
]

aggregated_documents = list(source_collection.aggregate(pipeline))
if aggregated_documents:
    transformed_documents = [{'hashtag': doc['hashtag'], 'ids': doc['ids']} for doc in aggregated_documents]
    target_collection.insert_many(transformed_documents)
    print(f"{len(aggregated_documents)} documents inserted into '{target_collection.full_name}'.")
else:
    print("No documents were aggregated or found.")


2781 documents inserted into 'twitter.Hashtags-Tweets._id'.


In [11]:
## Create Secondary index for hashtag on 'Hashtags-Tweets._id'

target_db = client['twitter']
target_collection = target_db['Hashtags-Tweets._id']

index_name = target_collection.create_index([('hashtag', 1)])

print(f"Index created: {index_name}")


Index created: hashtag_1


In [25]:
## Push data to server Database



Pinged your deployment. You successfully connected to MongoDB!


In [21]:
from pymongo import MongoClient

local_client = MongoClient('localhost', 27017)

# Select the database
db = local_client['DbmsProject']

# Select the collection
collection = db['RawData']

# Set to store unique field names
field_names = set()

# Iterate through each document in the collection
for document in collection.find():
    # Update the set with the document's keys (field names)
    field_names.update(document.keys())

# Convert the set to a list if you need a list format
field_list = list(field_names)

# Print the unique field names
print(field_list)


['in_reply_to_user_id', 'text', 'extended_entities', 'quoted_status_id', 'id_str', 'in_reply_to_status_id_str', 'truncated', 'reply_count', 'in_reply_to_status_id', '_id', 'created_at', 'withheld_in_countries', 'lang', 'coordinates', 'source', 'favorited', 'favorite_count', 'display_text_range', 'in_reply_to_user_id_str', 'retweeted', 'retweet_count', 'possibly_sensitive', 'geo', 'quoted_status', 'timestamp_ms', 'in_reply_to_screen_name', 'id', 'quote_count', 'retweeted_status', 'place', 'user', 'contributors', 'entities', 'is_quote_status', 'extended_tweet', 'quoted_status_permalink', 'quoted_status_id_str', 'filter_level']


In [None]:
_id
text
extended_entities
id_str
truncated
reply_count
created_at
source
withheld_in_countries
lang
coordinates
favorite_count
display_text_range
place
entities
extended_tweet
filter_level

user
contributors

In [None]:
is_quote_status
quoted_status_id
quoted_status
quote_count

retweeted_status
retweeted


quoted_status_permalink
#quoted_status_id_str

In [None]:
in_reply_to_user_id
in_reply_to_status_id_str
in_reply_to_status_id
in_reply_to_user_id_str

In [None]:
[
  {
    "created_at": "Sat Apr 25 12:21:41 +0000 2020",
    "id": 1254022770679320576,
    ## "id_str": "1254022770679320576",
    "text": "É isto, ou vou morrer sem ar ou com o corona https://t.co/O0Y7B3Koj4",
    ##"display_text_range": [
    ##  0,
    ##  44
    ## ],
    "source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
      "id": 804046791348015107,
     ## "id_str": "804046791348015107",
      "name": "Bi Sex Uau",
      "screen_name": "B_King69",
      "location": "Acre, Brasil",
      ## "url": "https://www.instagram.com/?hl=pt-br",
      "description": "se for da minha família já pode voltar daq mesmo",
      ## "translator_type": "none",
      "protected": false,
      "verified": false,
      "followers_count": 89,
      "friends_count": 173,
      "listed_count": 0,
      "favourites_count": 5446,
      "statuses_count": 4728,
      "created_at": "Wed Nov 30 19:37:48 +0000 2016",
      "utc_offset": null,
      "time_zone": null,
      "geo_enabled": false,
      "lang": null,
      "contributors_enabled": false,
      "is_translator": false,
#       "profile_background_color": "F5F8FA",
#       "profile_background_image_url": "",
#       "profile_background_image_url_https": "",
#       "profile_background_tile": false,
#       "profile_link_color": "1DA1F2",
#       "profile_sidebar_border_color": "C0DEED",
#       "profile_sidebar_fill_color": "DDEEF6",
#       "profile_text_color": "333333",
#       "profile_use_background_image": true,
#       "profile_image_url": "http://pbs.twimg.com/profile_images/1210794971231477761/hLtokcV-_normal.jpg",
#       "profile_image_url_https": "https://pbs.twimg.com/profile_images/1210794971231477761/hLtokcV-_normal.jpg",
#       "profile_banner_url": "https://pbs.twimg.com/profile_banners/804046791348015107/1577511053",
#       "default_profile": true,
#       "default_profile_image": false,
      "following": null,
      "follow_request_sent": null,
      "notifications": null
    },
    "geo": null,
    "coordinates": null,
    "place": null,
    "contributors": null,
    "is_quote_status": false,
    "quote_count": 0,
    "reply_count": 0,
    "retweet_count": 0,
    "favorite_count": 0,
    "entities": {
      "hashtags": [],
      "urls": [],
      "user_mentions": [],
      "symbols": [],
      "media": [
        {
          "id": 1254022759824461824,
          "id_str": "1254022759824461824",
          "indices": [
            45,
            68
          ],
          "media_url": "http://pbs.twimg.com/media/EWcvDlOXkAAznwM.jpg",
          "media_url_https": "https://pbs.twimg.com/media/EWcvDlOXkAAznwM.jpg",
          "url": "https://t.co/O0Y7B3Koj4",
          "display_url": "pic.twitter.com/O0Y7B3Koj4",
          "expanded_url": "https://twitter.com/B_King69/status/1254022770679320576/photo/1",
          "type": "photo",
#           "sizes": {
#             "thumb": {
#               "w": 150,
#               "h": 150,
#               "resize": "crop"
#             },
#             "small": {
#               "w": 345,
#               "h": 680,
#               "resize": "fit"
#             },
#             "medium": {
#               "w": 608,
#               "h": 1200,
#               "resize": "fit"
#             },
#             "large": {
#               "w": 1038,
#               "h": 2048,
#               "resize": "fit"
#             }
#           }
        }
      ]
    },
#     "extended_entities": {
#       "media": [
#         {
#           "id": 1254022759824461824,
#           "id_str": "1254022759824461824",
#           "indices": [
#             45,
#             68
#           ],
#           "media_url": "http://pbs.twimg.com/media/EWcvDlOXkAAznwM.jpg",
#           "media_url_https": "https://pbs.twimg.com/media/EWcvDlOXkAAznwM.jpg",
#           "url": "https://t.co/O0Y7B3Koj4",
#           "display_url": "pic.twitter.com/O0Y7B3Koj4",
#           "expanded_url": "https://twitter.com/B_King69/status/1254022770679320576/photo/1",
#           "type": "photo",
#           "sizes": {
#             "thumb": {
#               "w": 150,
#               "h": 150,
#               "resize": "crop"
#             },
#             "small": {
#               "w": 345,
#               "h": 680,
#               "resize": "fit"
#             },
#             "medium": {
#               "w": 608,
#               "h": 1200,
#               "resize": "fit"
#             },
#             "large": {
#               "w": 1038,
#               "h": 2048,
#               "resize": "fit"
#             }
#           }
#         }
#       ]
#     },
    "favorited": false,
    "retweeted": false,
    "possibly_sensitive": false,
    "filter_level": "low",
    "lang": "pt",
    "timestamp_ms": "1587817301803"
  },
]