diff --git a/.gitignore b/.gitignore index 6156808..fe6fae2 100644 --- a/.gitignore +++ b/.gitignore @@ -53,3 +53,4 @@ archive grequests *.bak *.json +screenlog* diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 73b9138..d87e290 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ - id: check-yaml - id: end-of-file-fixer - id: flake8 - args: [--max-line-length=140] + args: [--max-line-length=256] - id: trailing-whitespace - repo: git://github.com/ivanlei/pre-commit-python-sorter diff --git a/README.md b/README.md index b9f8f51..b3b3e1b 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,8 @@ [![Stories in In Progress](https://badge.waffle.io/krmaxwell/maltrieve.png?label=in%20progress&title=In%20Progress)](https://waffle.io/krmaxwell/maltrieve) [![Circle CI](https://circleci.com/gh/krmaxwell/maltrieve/tree/dev.svg?style=svg)](https://circleci.com/gh/krmaxwell/maltrieve/tree/dev) [![Coverage Status](https://coveralls.io/repos/krmaxwell/maltrieve/badge.svg?branch=dev)](https://coveralls.io/r/krmaxwell/maltrieve?branch=dev) +[![Code Health](https://landscape.io/github/krmaxwell/maltrieve/dev/landscape.svg?style=flat)](https://landscape.io/github/krmaxwell/maltrieve/dev) + ``` _______ _______ _______ ______ _____ _______ _ _ _______ | | | |_____| | | |_____/ | |______ \ / |______ @@ -48,6 +50,8 @@ Maltrieve requires the following dependencies: With the exception of the Python header files, these can all be found in [requirements.txt](./requirements.txt). On Debian-based distributions, run `sudo apt-get install python-dev`. On Red Hat-based distributions, run `sudo yum install python-devel`. After that, just `pip install -e .`. You may need to prepend that with ```sudo``` if not running in a virtual environment, but using such an environment is highly encouraged. +Alternately, avoid all of that by using the [Docker image](https://registry.hub.docker.com/u/technoskald/maltrieve/) + ## Usage __Basic execution:__ `maltrieve` (if installed normally) or ```python maltrieve.py``` (if just downloaded and run) @@ -66,6 +70,7 @@ optional arguments: Define file for logging progress -x, --vxcage Dump the files to a VxCage instance -v, --viper Dump the files to a Viper instance + -r, --crits Dump the file and domain to a CRITs instance -c, --cuckoo Enable Cuckoo analysis -s, --sort_mime Sort files by MIME type diff --git a/docker/Dockerfile b/docker/Dockerfile index 0f38d6e..195affd 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,8 +1,8 @@ # # This Docker image encapsulates Maltrieve, a tool to retrieve malware # directly from the source for security researchers. -# which was created by Kyle Maxwell (technoskald) and is -# available at https://github.com/technoskald/maltrieve. +# which was created by Kyle Maxwell (krmaxwell) and is +# available at https://github.com/krmaxwell/maltrieve. # # The file below is based on ideas from Spenser Reinhardt's Dockerfile # (https://registry.hub.docker.com/u/sreinhardt/honeynet/dockerfile) @@ -10,38 +10,40 @@ # # To run this image after installing Docker, use a command like this: # -# sudo docker run --rm -it technoskald/maltrieve bash -# -# then run ./maltrieve.py with the desired parameters. +# sudo docker run --rm -it technoskald/maltrieve FROM ubuntu:14.04 MAINTAINER Michael Boman USER root RUN apt-get update && \ - apt-get install -y --no-install-recommends \ + apt-get dist-upgrade -y +RUN apt-get install -y --no-install-recommends \ gcc \ git \ libpython2.7-stdlib \ python2.7 \ python2.7-dev \ python-pip \ - python-setuptools && \ - - rm -rf /var/lib/apt/lists/* && \ - + python-setuptools +RUN rm -rf /var/lib/apt/lists/* && \ + pip install --upgrade pip && \ groupadd -r maltrieve && \ useradd -r -g maltrieve -d /home/maltrieve -s /sbin/nologin -c "Maltrieve User" maltrieve WORKDIR /home -RUN git clone https://github.com/technoskald/maltrieve.git && \ +RUN git clone https://github.com/krmaxwell/maltrieve.git && \ cd maltrieve && \ - pip install -r requirements.txt && \ + git checkout dev && \ + pip install -e . && \ chown -R maltrieve:maltrieve /home/maltrieve +RUN mkdir /archive && \ + chown maltrieve:maltrieve /archive + USER maltrieve ENV HOME /home/maltrieve ENV USER maltrieve WORKDIR /home/maltrieve -CMD ["./maltrieve.py"] - +ENTRYPOINT ["maltrieve"] +CMD ["-d", "/archive/samples", "-l", "/archive/maltrieve.log"] diff --git a/maltrieve.cfg b/maltrieve.cfg index add59cf..f4703df 100644 --- a/maltrieve.cfg +++ b/maltrieve.cfg @@ -7,7 +7,10 @@ User-Agent = Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0) #viper = http://127.0.0.1:8080 #cuckoo = http://127.0.0.1:8090 #vxcage = http://127.0.0.1:8080 - +#crits = https://127.0.0.1 +#crits_user = maltrieve +#crits_key = +#crits_source = maltrieve # Filter Lists are based on mime type NO SPACE BETWEEN , #black_list = text/html,text/plain diff --git a/maltrieve.py b/maltrieve.py index 4520cce..82aeb79 100755 --- a/maltrieve.py +++ b/maltrieve.py @@ -20,6 +20,7 @@ import argparse import ConfigParser +import datetime import hashlib import json import logging @@ -37,7 +38,7 @@ from bs4 import BeautifulSoup -class config: +class config(object): """ Class for holding global configuration setup """ @@ -95,25 +96,34 @@ def __init__(self, args, filename='maltrieve.cfg'): try: os.makedirs(self.dumpdir) except IOError: - logging.error('Could not create {dir}, using default'.format(dir=self.dumpdir)) + logging.error('Could not create %s, using default', self.dumpdir) self.dumpdir = '/tmp/malware' try: fd, temp_path = tempfile.mkstemp(dir=self.dumpdir) except IOError: - logging.error('Could not open {dir} for writing, using default'.format(dir=self.dumpdir)) + logging.error('Could not open %s for writing, using default', self.dumpdir) self.dumpdir = '/tmp/malware' else: os.close(fd) os.remove(temp_path) - logging.info('Using {dir} as dump directory'.format(dir=self.dumpdir)) + logging.info('Using %s as dump directory', self.dumpdir) + self.logheaders = self.configp.get('Maltrieve', 'logheaders') # TODO: Merge these self.vxcage = args.vxcage or self.configp.has_option('Maltrieve', 'vxcage') self.cuckoo = args.cuckoo or self.configp.has_option('Maltrieve', 'cuckoo') self.viper = args.viper or self.configp.has_option('Maltrieve', 'viper') - self.logheaders = self.configp.get('Maltrieve', 'logheaders') + + # CRITs + if args.crits or self.configp.has_option('Maltrieve', 'crits'): + self.crits = args.crits or self.configp.get('Maltrieve', 'crits') + self.crits_user = self.configp.get('Maltrieve', 'crits_user') + self.crits_key = self.configp.get('Maltrieve', 'crits_key') + self.crits_source = self.configp.get('Maltrieve', 'crits_source') + else: + self.crits = False def check_proxy(self): if self.proxy: @@ -123,18 +133,127 @@ def check_proxy(self): print 'External sites see {ip}'.format(ip=my_ip) +def upload_crits(response, md5, cfg): + if response: + url_tag = urlparse(response.url) + mime_type = magic.from_buffer(response.content, mime=True) + files = {'filedata': (md5, response.content)} + headers = {'User-agent': 'Maltrieve'} + zip_files = ['application/zip', 'application/gzip', 'application/x-7z-compressed'] + rar_files = ['application/x-rar-compressed'] + inserted_domain = False + inserted_sample = False + + # submit domain / IP + # TODO: identify if it is a domain or IP and submit accordingly + url = "{srv}/api/v1/domains/".format(srv=cfg.crits) + domain_data = { + 'api_key': cfg.crits_key, + 'username': cfg.crits_user, + 'source': cfg.crits_source, + 'domain': url_tag.netloc + } + try: + # Note that this request does NOT go through proxies + logging.debug("Domain submission: %s|%r", url, domain_data) + domain_response = requests.post(url, headers=headers, data=domain_data) + # pylint says "Instance of LookupDict has no 'ok' member" + if domain_response.status_code == requests.codes.ok: + domain_response_data = domain_response.json() + if domain_response_data['return_code'] == 0: + inserted_domain = True + else: + logging.info("Submitted domain info %s for %s to CRITs, response was %s", + domain_data['domain'], md5, domain_response_data) + else: + logging.info("Submission of %s failed: %d", url, domain_response.status_code) + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting domain %s", domain_data['domain']) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting domain %s", domain_data['domain']) + except requests.HTTPError: + logging.info("HTTP error when submitting domain %s to CRITs", domain_data['domain']) + + # Submit sample + url = "{srv}/api/v1/samples/".format(srv=cfg.crits) + if mime_type in zip_files: + file_type = 'zip' + elif mime_type in rar_files: + file_type = 'rar' + else: + file_type = 'raw' + sample_data = { + 'api_key': cfg.crits_key, + 'username': cfg.crits_user, + 'source': cfg.crits_source, + 'upload_type': 'file', + 'md5': md5, + 'file_format': file_type # must be type zip, rar, or raw + } + try: + # Note that this request does NOT go through proxies + sample_response = requests.post(url, headers=headers, files=files, data=sample_data, verify=False) + # pylint says "Instance of LookupDict has no 'ok' member" + if sample_response.status_code == requests.codes.ok: + sample_response_data = sample_response.json() + if sample_response_data['return_code'] == 0: + inserted_sample = True + else: + logging.info("Submitted sample %s to CRITs, response was %r", md5, sample_response_data) + else: + logging.info("Submission of sample %s failed: %d}", md5, sample_response.status_code) + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting sample %s", md5) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting sample %s", md5) + except requests.HTTPError: + logging.info("HTTP error when submitting sample %s to CRITs", md5) + + # Create a relationship for the sample and domain + url = "{srv}/api/v1/relationships/".format(srv=cfg.crits) + if (inserted_sample and inserted_domain): + relationship_data = { + 'api_key': cfg.crits_key, + 'username': cfg.crits_user, + 'source': cfg.crits_source, + 'right_type': domain_response_data['type'], + 'right_id': domain_response_data['id'], + 'left_type': sample_response_data['type'], + 'left_id': sample_response_data['id'], + 'rel_type': 'Downloaded_From', + 'rel_confidence': 'high', + 'rel_date': datetime.datetime.now() + } + try: + # Note that this request does NOT go through proxies + relationship_response = requests.post(url, headers=headers, data=relationship_data, verify=False) + # pylint says "Instance of LookupDict has no 'ok' member" + if relationship_response.status_code != requests.codes.ok: + logging.info("Submitted relationship info for %s to CRITs, response was %r", + md5, domain_response_data) + except requests.ConnectionError: + logging.info("Could not connect to CRITs when submitting relationship for sample %s", md5) + except requests.ConnectTimeout: + logging.info("Timed out connecting to CRITs when submitting relationship for sample %s", md5) + except requests.HTTPError: + logging.info("HTTP error when submitting relationship for sample %s to CRITs", md5) + return True + else: + return False + + def upload_vxcage(response, md5, cfg): if response: url_tag = urlparse(response.url) files = {'file': (md5, response.content)} tags = {'tags': url_tag.netloc + ',Maltrieve'} - url = "{srv}/malware/add".format(cfg.vxcage) + url = "{srv}/malware/add".format(srv=cfg.vxcage) headers = {'User-agent': 'Maltrieve'} try: # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted {md5} to VxCage, response was {msg}".format(md5=md5, msg=response_data["message"])) + logging.info("Submitted %s to VxCage, response was %d", md5, response_data["message"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to VxCage, will attempt local storage") return False @@ -151,7 +270,7 @@ def upload_cuckoo(response, md5, cfg): try: response = requests.post(url, headers=headers, data=data) response_data = response.json() - logging.info("Submitted {md5} to Cuckoo, task ID {taskid}".format(md5=md5, taskid=response_data["task_id"])) + logging.info("Submitted %s to Cuckoo, task ID %d", md5, response_data["task_id"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to Cuckoo, will attempt local storage") return False @@ -170,7 +289,7 @@ def upload_viper(response, md5, cfg): # Note that this request does NOT go through proxies response = requests.post(url, headers=headers, files=files, data=tags) response_data = response.json() - logging.info("Submitted {md5} to Viper, response was {msg}".format(md5=md5, msg=response_data["message"])) + logging.info("Submitted %s to Viper, response was %s", md5, response_data["message"]) except requests.exceptions.ConnectionError: logging.info("Could not connect to Viper, will attempt local storage") return False @@ -183,22 +302,23 @@ def save_malware(response, cfg): data = response.content mime_type = magic.from_buffer(data, mime=True) if mime_type in cfg.black_list: - logging.info('{mtype} in ignore list for {url}'.format(mtype=mime_type, url=url)) + logging.info('%s in ignore list for %s', mime_type, url) return if cfg.white_list: if mime_type in cfg.white_list: pass else: - logging.info('{mtype} not in whitelist for {url}'.format(mtype=mime_type, url=url)) + logging.info('%s not in whitelist for %s', mime_type, url) return # Hash and log md5 = hashlib.md5(data).hexdigest() - logging.info("{url} hashes to {md5}".format(url=url, md5=md5)) + logging.info("%s hashes to %s", url, md5) # Assume that external repo means we don't need to write to file as well. stored = False # Submit to external services + # TODO: merge these if cfg.vxcage: stored = upload_vxcage(response, md5, cfg) or stored @@ -206,6 +326,8 @@ def save_malware(response, cfg): stored = upload_cuckoo(response, md5, cfg) or stored if cfg.viper: stored = upload_viper(response, md5, cfg) or stored + if cfg.crits: + stored = upload_crits(response, md5, cfg) or stored # else save to disk if not stored: if cfg.sort_mime: @@ -218,7 +340,7 @@ def save_malware(response, cfg): store_path = os.path.join(cfg.dumpdir, md5) with open(store_path, 'wb') as f: f.write(data) - logging.info("Saved {md5} to dump dir".format(md5=md5)) + logging.info("Saved %s to dump dir", md5) return True @@ -273,12 +395,15 @@ def setup_args(args): help="Define dump directory for retrieved files") parser.add_argument("-l", "--logfile", help="Define file for logging progress") - parser.add_argument("-x", "--vxcage", - help="Dump the files to a VxCage instance", + parser.add_argument("-r", "--crits", + help="Dump the file to a Crits instance.", action="store_true", default=False) parser.add_argument("-v", "--viper", help="Dump the files to a Viper instance", action="store_true", default=False) + parser.add_argument("-x", "--vxcage", + help="Dump the file to a VxCage instance", + action="store_true", default=False) parser.add_argument("-c", "--cuckoo", help="Enable Cuckoo analysis", action="store_true", default=False) parser.add_argument("-s", "--sort_mime", @@ -327,7 +452,7 @@ def main(): hashes = set() past_urls = set() - args = setup_args(sys.argv) + args = setup_args(sys.argv[1:]) cfg = config(args, 'maltrieve.cfg') cfg.check_proxy() @@ -360,7 +485,7 @@ def main(): print "Downloading samples, check log for details" malware_urls -= past_urls - reqs = [grequests.get(url, headers=headers, proxies=cfg.proxy) for url in malware_urls] + reqs = [grequests.get(url, timeout=60, headers=headers, proxies=cfg.proxy) for url in malware_urls] for chunk in chunker(reqs, 32): malware_downloads = grequests.map(chunk) for each in malware_downloads: diff --git a/setup.py b/setup.py index 5911a1e..aaafd38 100644 --- a/setup.py +++ b/setup.py @@ -2,21 +2,21 @@ from distutils.core import setup setup(name='maltrieve', - version='0.6', + version='0.7', description="A tool to retrieve malware directly from the source for security researchers.", author='Kyle Maxwell', author_email='krmaxwell@gmail.com', url='http://maltrieve.org', install_requires=[ - 'argparse==1.2.1', - 'beautifulsoup4==4.3.2', - 'feedparser==5.1.3', - 'gevent==1.0.1', - 'greenlet==0.4.2', - 'grequests==0.2.0', - 'python-magic==0.4.6', - 'requests==2.3.0', - 'wsgiref==0.1.2', + 'argparse>=1.2.1', + 'beautifulsoup4>=4.3.2', + 'feedparser>=5.1.3', + 'gevent>=1.0.1', + 'greenlet>=0.4.2', + 'grequests>=0.2.0', + 'python-magic>=0.4.6', + 'requests>=2.3.0', + 'wsgiref>=0.1.2', 'pre-commit', 'pytest', 'pytest-cov', diff --git a/src/MultiPartForm.py b/src/MultiPartForm.py deleted file mode 100644 index 0af6d96..0000000 --- a/src/MultiPartForm.py +++ /dev/null @@ -1,69 +0,0 @@ -import itertools -import mimetools -import mimetypes -import urllib -import urllib2 - - -class MultiPartForm(object): - """Accumulate the data to be used when posting a form.""" - - def __init__(self): - self.form_fields = [] - self.files = [] - self.boundary = mimetools.choose_boundary() - return - - def get_content_type(self): - return 'multipart/form-data; boundary=%s' % self.boundary - - def add_field(self, name, value): - """Add a simple field to the form data.""" - self.form_fields.append((name, value)) - return - - def add_file(self, fieldname, filename, fileHandle, mimetype=None): - """Add a file to be uploaded.""" - body = fileHandle.read() - if mimetype is None: - mimetype = mimetypes.guess_type(filename)[0] or 'application/octet-stream' - self.files.append((fieldname, filename, mimetype, body)) - return - - def __str__(self): - """Return a string representing the form data, including attached files.""" - # Build a list of lists, each containing "lines" of the - # request. Each part is separated by a boundary string. - # Once the list is built, return a string where each - # line is separated by '\r\n'. - parts = [] - part_boundary = '--' + self.boundary - - # Add the form fields - parts.extend( - [ part_boundary, - 'Content-Disposition: form-data; name="%s"' % name, - '', - value, - ] - for name, value in self.form_fields - ) - - # Add the files to upload - parts.extend( - [ part_boundary, - 'Content-Disposition: file; name="%s"; filename="%s"' % \ - (field_name, filename), - 'Content-Type: %s' % content_type, - '', - body, - ] - for field_name, filename, content_type, body in self.files - ) - - # Flatten the list and add closing boundary marker, - # then return CR+LF separated data - flattened = list(itertools.chain(*parts)) - flattened.append('--' + self.boundary + '--') - flattened.append('') - return '\r\n'.join(flattened) diff --git a/src/__init__.py b/src/__init__.py index a3591fe..e69de29 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,2 +0,0 @@ -''' This file is part of maltrieve. See LICENSE for license details. ''' - diff --git a/src/maltrieve.py b/src/maltrieve.py deleted file mode 100755 index 76a30b4..0000000 --- a/src/maltrieve.py +++ /dev/null @@ -1,277 +0,0 @@ -# Copyright 2013 Kyle Maxwell -# Includes code from mwcrawler, (c) 2012 Ricardo Dias. Used under license. - -# Maltrieve - retrieve malware from the source - -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see